From 2af0f87c8b56567e0254aae7a1ccbedb04413b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Jul 2015 23:32:52 +0600 Subject: [PATCH 01/17] [prosiebensat1] Fix extraction (Closes #6215) --- youtube_dl/extractor/prosiebensat1.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 536a42dc8..22efa903f 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -9,8 +9,9 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( - unified_strdate, + fix_xml_ampersands, int_or_none, + unified_strdate, ) @@ -208,7 +209,7 @@ class ProSiebenSat1IE(InfoExtractor): clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'prosieben' - client_name = 'kolibri-1.12.6' + client_name = 'kolibri-2.0.19-splec4' client_location = url videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ @@ -275,8 +276,9 @@ class ProSiebenSat1IE(InfoExtractor): for source in urls_sources: protocol = source['protocol'] + source_url = source['url'] if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url']) + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) if not mobj: continue path = mobj.group('path') @@ -293,9 +295,18 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), }) + elif 'f4mgenerator' in source_url: + manifest = self._download_xml( + source_url, clip_id, 'Downloading generated f4m manifest', + transform_source=lambda s: fix_xml_ampersands(s).strip()) + for media in manifest.findall('./{http://ns.adobe.com/f4m/2.0}media'): + manifest_url = media.get('href') + if manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url, clip_id, f4m_id='hds')) else: formats.append({ - 'url': 
source['url'], + 'url': source_url, 'vbr': fix_bitrate(source['bitrate']), }) From 97f4aecfc1c5fd446e1d3edd37e49aafe246fe0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 01:14:08 +0600 Subject: [PATCH 02/17] [extractor/common] Handle malformed f4m manifests --- youtube_dl/extractor/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3a396c0b0..f8a5ecced 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -28,6 +28,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + fix_xml_ampersands, float_or_none, int_or_none, RegexNotFoundError, @@ -837,7 +838,10 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest') + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=lambda s: fix_xml_ampersands(s).strip()) formats = [] manifest_version = '1.0' From cc357c4db8112ff6736a227b47fb9527d327797f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 01:14:52 +0600 Subject: [PATCH 03/17] [extractor/common] Properly handle full URLs --- youtube_dl/extractor/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f8a5ecced..78e5cf8d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -851,8 +851,10 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + - (media_el.attrib.get('href') or media_el.attrib.get('url'))) + media_url = media_el.attrib.get('href') or media_el.attrib['url'] + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), From 70f0f5a8ca53d4426fc079b3ab46e9d4a8e81ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 01:15:15 +0600 Subject: [PATCH 04/17] [extractor/common] Recursively extract child f4m manifests --- youtube_dl/extractor/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78e5cf8d0..e3c610aa4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..utils import ( bug_reports_message, clean_html, 
compiled_regex_type, + determine_ext, ExtractorError, fix_xml_ampersands, float_or_none, @@ -855,6 +856,13 @@ class InfoExtractor(object): manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + if determine_ext(manifest_url) == 'f4m': + formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), From f01f731107010e0c10fc94782daa7a3ba543e92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 01:15:47 +0600 Subject: [PATCH 05/17] [prosiebensat1] Use generic f4m manifest extraction --- youtube_dl/extractor/prosiebensat1.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 22efa903f..2f9d95800 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -9,7 +9,7 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( - fix_xml_ampersands, + determine_ext, int_or_none, unified_strdate, ) @@ -295,15 +295,8 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), }) - elif 'f4mgenerator' in source_url: - manifest = self._download_xml( - source_url, clip_id, 'Downloading generated f4m manifest', - transform_source=lambda s: fix_xml_ampersands(s).strip()) - for media in manifest.findall('./{http://ns.adobe.com/f4m/2.0}media'): - manifest_url = media.get('href') - if manifest_url: - 
formats.extend(self._extract_f4m_formats( - manifest_url, clip_id, f4m_id='hds')) + elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats(source_url, clip_id)) else: formats.append({ 'url': source_url, From 31c746e5dc46491f997eca757c5e35842f04cb59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 01:25:33 +0600 Subject: [PATCH 06/17] [extractor/common] Keep going in some media_url is missing --- youtube_dl/extractor/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e3c610aa4..271bf8596 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -852,7 +852,9 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - media_url = media_el.attrib.get('href') or media_el.attrib['url'] + media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + if not media_url: + continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) From ab9b890b524a49a9ffa4c8ac7243cd8afc15d270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 02:23:07 +0600 Subject: [PATCH 07/17] [prosiebensat1] Clarify test purpose --- youtube_dl/extractor/prosiebensat1.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 2f9d95800..fec008ce7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -22,6 +22,11 @@ class ProSiebenSat1IE(InfoExtractor): _TESTS = [ { + # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242 + # in response to fixing 
https://github.com/rg3/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', From e118031ef827e851e537daa5b439cf5c249ca88d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 23:21:04 +0600 Subject: [PATCH 08/17] [npo] Extend _VALID_URL to support ntr.nl (Closes #6248) --- youtube_dl/extractor/npo.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 62d12b7a6..1c823ec7f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -37,8 +37,9 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): - IE_NAME = 'npo.nl' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)' + IE_NAME = 'npo' + IE_DESC = 'npo.nl and ntr.nl' + _VALID_URL = r'https?://(?:www\.)?(?:npo|ntr)\.nl/(?!live|radio)(?:[^/]+/){2,}(?P<id>[^/?#]+)' _TESTS = [ { @@ -100,6 +101,18 @@ class NPOIE(NPOBaseIE): 'title': 'Hoe gaat Europa verder na Parijs?', }, }, + { + 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', + 'md5': '01c6a2841675995da1f0cf776f03a9c3', + 'info_dict': { + 'id': 'VPWON_1233944', + 'ext': 'm4v', + 'title': 'Aap, poot, pies', + 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'upload_date': '20150508', + 'duration': 599, + }, + } ] def _real_extract(self, url): From 525daedd5a092b0f5329952eee99a7dac5537433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 23:54:43 +0600 Subject: [PATCH 09/17] [npo] Add support for omroepwnl fragments --- youtube_dl/extractor/npo.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff 
--git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 1c823ec7f..a5162c0c6 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,12 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_urllib_request, + compat_urllib_parse, +) from ..utils import ( fix_xml_ampersands, parse_duration, @@ -39,7 +45,16 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' - _VALID_URL = r'https?://(?:www\.)?(?:npo|ntr)\.nl/(?!live|radio)(?:[^/]+/){2,}(?P<id>[^/?#]+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + npo\.nl/(?!live|radio)(?:[^/]+/){2}| + ntr\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__ + ) + (?P<id>[^/?#]+) + ''' _TESTS = [ { @@ -112,6 +127,18 @@ class NPOIE(NPOBaseIE): 'upload_date': '20150508', 'duration': 599, }, + }, + { + 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', + 'md5': 'd30cd8417b8b9bca1fdff27428860d08', + 'info_dict': { + 'id': 'POW_00996502', + 'ext': 'm4v', + 'title': '''"Dit is wel een 'landslide'..."''', + 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', + 'upload_date': '20150508', + 'duration': 462, + }, } ] @@ -127,6 +154,11 @@ class NPOIE(NPOBaseIE): transform_source=strip_jsonp, ) + # For some videos actual video id (prid) is different (e.g. 
for + # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 + # video id is POMS_WNL_853698 but prid is POW_00996502) + video_id = metadata.get('prid') or video_id + token = self._get_token(video_id) formats = [] From 50ea2bb20d3a3e219910e87b8b30fc79ce534595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Jul 2015 23:56:57 +0600 Subject: [PATCH 10/17] [npo] Update test --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a5162c0c6..cf6a388e5 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -87,7 +87,7 @@ class NPOIE(NPOBaseIE): 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, }, From 03f32a7eadf9d832aef55673edf38023a8daff95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:14:38 +0600 Subject: [PATCH 11/17] [wnl] Add extractor for omroepwnl playlists --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/npo.py | 45 +++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3f4f23521..1d55275dc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -390,6 +390,7 @@ from .npo import ( NPORadioIE, NPORadioFragmentIE, TegenlichtVproIE, + WNLIE ) from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cf6a388e5..c6bf7619d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -46,12 +46,15 @@ class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' _VALID_URL = r'''(?x) - https?:// - (?:www\.)? 
- (?: - npo\.nl/(?!live|radio)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| - omroepwnl\.nl/video/fragment/[^/]+__ + (?: + npo:| + https?:// + (?:www\.)? + (?: + npo\.nl/(?!live|radio)(?:[^/]+/){2}| + ntr\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__ + ) ) (?P<id>[^/?#]+) ''' @@ -426,3 +429,33 @@ class TegenlichtVproIE(NPOIE): info_page = self._download_json( 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) return self._get_info(info_page['mid']) + + +class WNLIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' + + _TEST = { + 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', + 'info_dict': { + 'id': 'vandaag-de-dag-6-mei', + 'title': 'Vandaag de Dag 6 mei', + }, + 'playlist_count': 4, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id, part in re.findall( + r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) + ] + + playlist_title = self._html_search_regex( + r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>', + webpage, 'playlist title') + + return self.playlist_result(entries, playlist_id, playlist_title) From 611ac379bb466267aded6726f9c85e79b08168c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:34:24 +0600 Subject: [PATCH 12/17] [vpro] Fix extraction and add support for vpro playlists --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/npo.py | 35 +++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1d55275dc..06f21064b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -389,7 +389,7 @@ from .npo import ( NPOLiveIE, NPORadioIE, NPORadioFragmentIE, - TegenlichtVproIE, + VPROIE, WNLIE ) from .nrk import ( diff --git 
a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c6bf7619d..28d5c90b3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -404,9 +404,8 @@ class NPORadioFragmentIE(InfoExtractor): } -class TegenlichtVproIE(NPOIE): - IE_NAME = 'tegenlicht.vpro.nl' - _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' +class VPROIE(NPOIE): + _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _TESTS = [ { @@ -416,19 +415,35 @@ class TegenlichtVproIE(NPOIE): 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, }, + { + 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', + 'info_dict': { + 'id': 'sergio-herman', + 'title': 'Sergio Herman: Fucking perfect', + }, + 'playlist_count': 2, + } ] def _real_extract(self, url): - name = url_basename(url) - webpage = self._download_webpage(url, name) - urn = self._html_search_meta('mediaurn', webpage) - info_page = self._download_json( - 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) - return self._get_info(info_page['mid']) + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) + ] + + playlist_title = self._search_regex( + r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>', + webpage, 'playlist title', default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) class WNLIE(InfoExtractor): From 5ba761eb854d6e415b3ab542293cb31c073dc0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:39:22 +0600 Subject: [PATCH 13/17] [npo] Prefer aflevering_titel over titel --- youtube_dl/extractor/npo.py | 6 ++++-- 1 file changed, 4 insertions(+), 
2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 28d5c90b3..91adb23f0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -234,7 +234,9 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': metadata['titel'], + # prefer aflevering_titel if any since titel may be too generic, e.g. + # http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html + 'title': metadata.get('aflevering_titel') or metadata['titel'], 'description': metadata['info'], 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), @@ -414,7 +416,7 @@ class VPROIE(NPOIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', + 'title': 'De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, From 536b0700b03f0b29a1025be0b7753253bd627d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:40:04 +0600 Subject: [PATCH 14/17] [npo] Allow missing description --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 91adb23f0..e733d96f6 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -237,7 +237,7 @@ class NPOIE(NPOBaseIE): # prefer aflevering_titel if any since titel may be too generic, e.g. 
# http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html 'title': metadata.get('aflevering_titel') or metadata['titel'], - 'description': metadata['info'], + 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), From 574f42d79a8596ceda681b205e19e766e7bab046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:53:12 +0600 Subject: [PATCH 15/17] [vpro] Improve playlist extraction --- youtube_dl/extractor/npo.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e733d96f6..583ed3e14 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -428,6 +428,15 @@ class VPROIE(NPOIE): 'title': 'Sergio Herman: Fucking perfect', }, 'playlist_count': 2, + }, + { + # playlist with youtube embed + 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', + 'info_dict': { + 'id': 'education-education', + 'title': '2Doc', + }, + 'playlist_count': 2, } ] @@ -437,7 +446,7 @@ class VPROIE(NPOIE): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('npo:%s' % video_id, 'NPO') + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) ] From 1540119723e7e2195a47d659993a6a3bcc02d3e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:54:05 +0600 Subject: [PATCH 16/17] [npo] Remove unused imports --- youtube_dl/extractor/npo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 583ed3e14..f5ffe1231 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,17 +3,12 @@ from __future__ import unicode_literals 
import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_request, - compat_urllib_parse, -) from ..utils import ( fix_xml_ampersands, parse_duration, qualities, strip_jsonp, unified_strdate, - url_basename, ) From 23fc384f2c3cf9afd41fd7e033fe0823d0fc5fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 01:28:52 +0600 Subject: [PATCH 17/17] [npo] Compound title --- youtube_dl/extractor/npo.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f5ffe1231..0c2d02c10 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -72,7 +72,7 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VARA_101191800', 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', 'upload_date': '20090227', 'duration': 2400, @@ -84,7 +84,7 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', + 'title': 'Tegenlicht: De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, @@ -157,6 +157,13 @@ class NPOIE(NPOBaseIE): # video id is POMS_WNL_853698 but prid is POW_00996502) video_id = metadata.get('prid') or video_id + # titel is too generic in some cases so utilize aflevering_titel as well + # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) + title = metadata['titel'] + sub_title = metadata.get('aflevering_titel') + if sub_title and sub_title != title: + title += ': %s' % sub_title + token = self._get_token(video_id) formats = [] @@ -229,9 +236,7 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - # prefer aflevering_titel if any since titel may be too generic, e.g. 
- # http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html - 'title': metadata.get('aflevering_titel') or metadata['titel'], + 'title': title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')),