From 51bd5226676b0c2d02264743b38da14dbfcb4a03 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Mon, 5 Aug 2019 12:48:39 +0200 Subject: [PATCH 01/10] [ard] Improve resolution extraction for HTTP files --- youtube_dl/extractor/ard.py | 202 ++++++++++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 8adae4644..c56e5179a 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from .generic import GenericIE from ..utils import ( determine_ext, + dict_get, ExtractorError, int_or_none, parse_duration, @@ -17,6 +18,7 @@ from ..utils import ( unified_timestamp, update_url_query, url_or_none, + url_basename, xpath_text, ) from ..compat import compat_etree_fromstring @@ -325,6 +327,151 @@ class ARDBetaMediathekIE(InfoExtractor): 'only_matching': True, }] + _format_url_templates = [ + # Das Erste + { + 'pattern': r'^.+/(?P\d+)-[^/]+_[^/]+\..{3,4}$', + 'format_id_suffix': 'width', + }, + + # SWR / SR / NDR + { + 'pattern': r'^.+/[^/]+\.(?P[a-z]+)\..{3,4}$', + 'format_id_suffix': 'width_key', + 'width_dict': { + # SWR / SR + 'xxl': 1920, + 'xl': 1280, + 'l': 960, + 'ml': 640, + 'm': 512, + 'sm': 480, + 's': 320, + + # NDR + 'hd': 1280, + 'hq': 960, + 'ln': 640, + 'hi': 512, + 'mn': 480, + 'lo': 320, + }, + }, + + # BR / ARD-alpha / SR + { + 'pattern': r'^.+/[^/]+_(?P[A-Z0-9])\..{3,4}$', + 'format_id_suffix': 'width_key', + 'width_dict': { + # BR, ARD-alpha + 'X': 1280, + 'C': 960, + 'E': 640, + 'B': 512, + '2': 480, + 'A': 480, + '0': 320, + + # SR + 'P': 1280, + 'L': 960, + 'N': 640, + 'M': 512, + 'K': 480, + 'S': 320, + }, + }, + + # HR + { + 'pattern': r'^.+/[^/]+?(?P[0-9]+)x(?P[0-9]+)-(?P[0-9]+)[pi]-(?P[0-9]+)kbit\..{3,4}$', + 'format_id_suffix': 'tbr', + }, + + # Radio Bremen + { + 'pattern': r'^.+/[^/]+_(?P\d+)p\..{3,4}$', + 'format_id_suffix': 
'height', + }, + + # RBB + { + 'pattern': r'^.+/[^/]+_(?P\d+)k\..{3,4}$', + 'format_id_suffix': 'vbr', + }, + + # tagesschau24 + { + 'pattern': r'^.+/[^/]+\.(?P[a-z]+)\.[^/]+\..{3,4}$', + 'format_id_suffix': 'width_key', + 'width_dict': { + 'webxl': 1280, + 'webl': 960, + 'webml': 640, + 'webm': 512, + 'websm': 480, + 'webs': 256, + }, + }, + + # MDR + { + 'pattern': r'^.+/[^/]+-(?P[a-z0-9]+)_[^/]+\..{3,4}$', + 'format_id_suffix': 'width_key', + 'width_dict': { + 'be7c2950aac6': 1280, + '730aae549c28': 960, + '41dd60577440': 640, + '9a4bb04739be': 512, + '39c393010ca9': 480, + 'd1ceaa57a495': 320, + }, + }, + + # TODO Find out format data for videos from WDR and ONE. + ] + + def _get_format_from_url(self, format_url, quality): + """Extract as much format data from the format_url as possible. + + Use the templates listed in _format_url_templates to do so. + """ + + result = { + 'url': format_url, + 'preference': 10, # Plain HTTP, that's nice + } + + format_id_suffix = None + + for template in self._format_url_templates: + m = re.match(template['pattern'], format_url) + if m: + groupdict = m.groupdict() + result['width'] = int_or_none(groupdict.get('width')) + result['height'] = int_or_none(groupdict.get('height')) + result['fps'] = int_or_none(groupdict.get('fps')) + result['tbr'] = int_or_none(groupdict.get('tbr')) + result['vbr'] = int_or_none(groupdict.get('vbr')) + + width_dict = template.get('width_dict') + if width_dict: + result['width'] = width_dict.get(groupdict.get('width_key')) + + format_id_suffix = groupdict.get(template.get('format_id_suffix')) + break + + if result.get('width') and not result.get('height'): + result['height'] = int((result['width'] / 16) * 9) + + if result.get('height') and not result.get('width'): + result['width'] = int((result['height'] / 9) * 16) + + result['format_id'] = (('http-' + quality) if quality else 'http') + ('-' + format_id_suffix if format_id_suffix else '') + + return result + + def _real_extract(self, url): mobj = 
re.match(self._VALID_URL, url) video_id = mobj.group('video_id') @@ -333,6 +480,8 @@ class ARDBetaMediathekIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') data = self._parse_json(data_json, display_id) + #import json + #print(json.dumps(data, indent=2)) res = { 'id': video_id, @@ -361,36 +510,45 @@ class ARDBetaMediathekIE(InfoExtractor): 'url': subtitle_url, }) if '_quality' in widget: - format_url = url_or_none(try_get( - widget, lambda x: x['_stream']['json'][0])) - if not format_url: + # Read format URLs from a MediaStreamArray + stream_array = try_get(widget, + lambda x: x['_stream']['json']) + if not stream_array: continue - ext = determine_ext(format_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=3.11.0', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - fatal=False)) - else: - # HTTP formats are not available when geoblocked is True, - # other formats are fine though - if geoblocked: + + for format_url in stream_array: + format_url = url_or_none(format_url) + if not format_url: continue - quality = str_or_none(widget.get('_quality')) - formats.append({ - 'format_id': ('http-' + quality) if quality else 'http', - 'url': format_url, - 'preference': 10, # Plain HTTP, that's nice - }) + + # Make sure this format isn't already in our list. + # Occassionally, there are duplicate files from + # different servers. 
+ duplicate = next((x for x in formats + if url_basename(x['url']) == url_basename(format_url)), None) + if duplicate: + continue + + ext = determine_ext(format_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.11.0', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) + else: + quality = str_or_none(widget.get('_quality')) + formats.append(self._get_format_from_url(format_url, quality)) if not formats and geoblocked: self.raise_geo_restricted( msg='This video is not available due to geoblocking', countries=['DE']) + # TODO Improve error handling when video is only unavailable at + # certain times due to age restrictions. self._sort_formats(formats) res.update({ 'subtitles': subtitles, From 663779e2e8c4f1ac57a85d7541bdb61554effc51 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Sat, 17 Aug 2019 20:43:28 +0200 Subject: [PATCH 02/10] [ard] Add extraction and handling of age_limit --- youtube_dl/extractor/ard.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index c56e5179a..7e028afd7 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -480,8 +480,6 @@ class ARDBetaMediathekIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') data = self._parse_json(data_json, display_id) - #import json - #print(json.dumps(data, indent=2)) res = { 'id': video_id, @@ -490,9 +488,12 @@ class ARDBetaMediathekIE(InfoExtractor): formats = [] subtitles = {} geoblocked = False + blocked_by_fsk = False for widget in data.values(): if widget.get('_geoblocked') is True: geoblocked = True + if widget.get('blockedByFsk') is True: + blocked_by_fsk = 
True if '_duration' in widget: res['duration'] = int_or_none(widget['_duration']) if 'clipTitle' in widget: @@ -503,6 +504,15 @@ class ARDBetaMediathekIE(InfoExtractor): res['timestamp'] = unified_timestamp(widget['broadcastedOn']) if 'synopsis' in widget: res['description'] = widget['synopsis'] + if 'maturityContentRating' in widget: + fsk_str = str_or_none(widget['maturityContentRating']) + if fsk_str: + m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) + if m and m.group(1): + res['age_limit'] = int_or_none(m.group(1)) + else: + res['age_limit'] = 0 + subtitle_url = url_or_none(widget.get('_subtitleUrl')) if subtitle_url: subtitles.setdefault('de', []).append({ @@ -547,8 +557,11 @@ class ARDBetaMediathekIE(InfoExtractor): msg='This video is not available due to geoblocking', countries=['DE']) - # TODO Improve error handling when video is only unavailable at - # certain times due to age restrictions. + if not formats and blocked_by_fsk: + raise ExtractorError( + msg = 'This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' 
% (res['age_limit'], 22 if res['age_limit'] < 18 else 23), + expected = True) + self._sort_formats(formats) res.update({ 'subtitles': subtitles, From 0f6bfb1325eb7564fdfec2feaea74b43d11fd6d7 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Sat, 17 Aug 2019 22:50:51 +0200 Subject: [PATCH 03/10] [ard] Extract episode information --- youtube_dl/extractor/ard.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 7e028afd7..021d552b6 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -471,6 +471,34 @@ class ARDBetaMediathekIE(InfoExtractor): return result + def _extract_episode_info(self, title): + patterns = [ + r'.*(?P \(S(?P\d+)/E(?P\d+)\)).*', + r'.*(?P \((?:Folge |Teil )?(?P\d+)(?:/\d+)?\)).*', + r'.*(?PFolge (?P\d+)(?:\:| -|) )\"(?P.+)\".*', + r'.*(?PFolge (?P\d+)(?:\:| -|) ).*', + ] + res = {} + + for pattern in patterns: + m = re.match(pattern, title) + if m: + groupdict = m.groupdict() + for int_entry in ['season_number', 'episode_number']: + res[int_entry] = int_or_none(groupdict.get(int_entry)) + + for str_entry in ['episode']: + res[str_entry] = str_or_none(groupdict.get(str_entry)) + + if groupdict.get('ep_info') and not res['episode']: + res['episode'] = str_or_none(title.replace(groupdict.get('ep_info'), '')) + + if res['episode']: + res['episode'] = res['episode'].strip() + + break + + return res def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -480,6 +508,8 @@ class ARDBetaMediathekIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') data = self._parse_json(data_json, display_id) + #import json + #print(json.dumps(data, indent=2)) res = { 'id': video_id, @@ -559,8 +589,8 @@ class ARDBetaMediathekIE(InfoExtractor): if not 
formats and blocked_by_fsk: raise ExtractorError( - msg = 'This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (res['age_limit'], 22 if res['age_limit'] < 18 else 23), - expected = True) + msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (res['age_limit'], 22 if res['age_limit'] < 18 else 23), + expected=True) self._sort_formats(formats) res.update({ @@ -568,4 +598,6 @@ class ARDBetaMediathekIE(InfoExtractor): 'formats': formats, }) + res.update(self._extract_episode_info(res.get('title'))) + return res From 614c62ec8f3654dc8fcaddb5efb3d260eb593697 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Sun, 18 Aug 2019 21:53:02 +0200 Subject: [PATCH 04/10] [ard] Streamline metadata extraction --- youtube_dl/extractor/ard.py | 177 ++++++++++++++++++++++++++---------- 1 file changed, 127 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 021d552b6..7e040d1dd 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -471,16 +471,59 @@ class ARDBetaMediathekIE(InfoExtractor): return result + def _get_player_page(self, data): + if not data: + return None + + root = data.get('ROOT_QUERY') + if root: + for val in root.values(): + if val.get('typename') == 'PlayerPage': + return data.get(val.get('id')) + return None + + def _get_player_page_element(self, data, player_page, entry, key=None): + element = player_page.get(entry) + if element == None or key == None: + return element + + element_id = element.get('id') + if not element_id: + return None + + data_element = data.get(element_id) + if not data_element: + return None + + return data_element.get(key) + + def _is_flag_set(self, data, flag): + player_page = self._get_player_page(data) + + if not player_page: + return False + + return self._get_player_page_element(data, 
player_page, flag) + + def _extract_age_limit(self, fsk_str): + m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) + if m and m.group(1): + return int_or_none(m.group(1)) + else: + return 0 + def _extract_episode_info(self, title): - patterns = [ + res = {} + if not title: + return res + + # Try to read episode data from the title. + for pattern in [ r'.*(?P \(S(?P\d+)/E(?P\d+)\)).*', r'.*(?P \((?:Folge |Teil )?(?P\d+)(?:/\d+)?\)).*', r'.*(?PFolge (?P\d+)(?:\:| -|) )\"(?P.+)\".*', - r'.*(?PFolge (?P\d+)(?:\:| -|) ).*', - ] - res = {} - - for pattern in patterns: + r'.*(?PFolge (?P\d+)(?:/\d+)?(?:\:| -|) ).*', + ]: m = re.match(pattern, title) if m: groupdict = m.groupdict() @@ -490,6 +533,8 @@ class ARDBetaMediathekIE(InfoExtractor): for str_entry in ['episode']: res[str_entry] = str_or_none(groupdict.get(str_entry)) + # Build the episode title by removing numeric episode + # information. if groupdict.get('ep_info') and not res['episode']: res['episode'] = str_or_none(title.replace(groupdict.get('ep_info'), '')) @@ -500,6 +545,66 @@ class ARDBetaMediathekIE(InfoExtractor): return res + def _extract_metadata(self, data): + res = {} + + player_page = self._get_player_page(data) + + if player_page: + for template in [ + { 'dict_key': 'channel', + 'entry': 'publicationService', + 'key': 'name' }, + + { 'dict_key': 'series', + 'entry': 'show', + 'key': 'title' }, + + { 'dict_key': 'title', + 'entry': 'title' }, + + { 'dict_key': 'description', + 'entry': 'synopsis' }, + + { 'dict_key': 'thumbnail', + 'entry': 'image', + 'key': 'src', + 'filter': lambda image_url: image_url.replace('{width}', '1920') }, + + { 'dict_key': 'timestamp', + 'entry': 'broadcastedOn', + 'filter': unified_timestamp }, + + { 'dict_key': 'release_date', + 'entry': 'broadcastedOn', + 'filter': unified_strdate }, + + { 'dict_key': 'age_limit', + 'entry': 'maturityContentRating', + 'filter': self._extract_age_limit }, + + { 'dict_key': 'duration', + 'entry': 'mediaCollection', + 'key': '_duration', + 
'filter': int_or_none }, + + { 'dict_key': 'subtitles', + 'entry': 'mediaCollection', + 'key': '_subtitleUrl', + 'filter': lambda subtitle_url: { 'de': [ { 'ext': 'ttml', 'url': subtitle_url } ]} }, + ]: + value = self._get_player_page_element(data, + player_page, + template.get('entry'), + template.get('key')) + if value != None: + filter_func = template.get('filter', str_or_none) + res[template['dict_key']] = filter_func(value) + + res.update(self._extract_episode_info(res.get('title'))) + + return res + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') @@ -508,47 +613,21 @@ class ARDBetaMediathekIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') data = self._parse_json(data_json, display_id) - #import json - #print(json.dumps(data, indent=2)) + + if not data: + raise ExtractorError( + msg='Did not find any video data to extract', expected=True) res = { 'id': video_id, 'display_id': display_id, } - formats = [] - subtitles = {} - geoblocked = False - blocked_by_fsk = False - for widget in data.values(): - if widget.get('_geoblocked') is True: - geoblocked = True - if widget.get('blockedByFsk') is True: - blocked_by_fsk = True - if '_duration' in widget: - res['duration'] = int_or_none(widget['_duration']) - if 'clipTitle' in widget: - res['title'] = widget['clipTitle'] - if '_previewImage' in widget: - res['thumbnail'] = widget['_previewImage'] - if 'broadcastedOn' in widget: - res['timestamp'] = unified_timestamp(widget['broadcastedOn']) - if 'synopsis' in widget: - res['description'] = widget['synopsis'] - if 'maturityContentRating' in widget: - fsk_str = str_or_none(widget['maturityContentRating']) - if fsk_str: - m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) - if m and m.group(1): - res['age_limit'] = int_or_none(m.group(1)) - else: - res['age_limit'] = 0 - subtitle_url = 
url_or_none(widget.get('_subtitleUrl')) - if subtitle_url: - subtitles.setdefault('de', []).append({ - 'ext': 'ttml', - 'url': subtitle_url, - }) + res.update(self._extract_metadata(data)) + + # Extract video formats + formats = [] + for widget in data.values(): if '_quality' in widget: # Read format URLs from a MediaStreamArray stream_array = try_get(widget, @@ -582,22 +661,20 @@ class ARDBetaMediathekIE(InfoExtractor): quality = str_or_none(widget.get('_quality')) formats.append(self._get_format_from_url(format_url, quality)) - if not formats and geoblocked: + if not formats and self._is_flag_set(data, 'geoblocked'): self.raise_geo_restricted( msg='This video is not available due to geoblocking', countries=['DE']) - if not formats and blocked_by_fsk: + if not formats and self._is_flag_set(data, 'blockedByFsk'): raise ExtractorError( msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (res['age_limit'], 22 if res['age_limit'] < 18 else 23), expected=True) - self._sort_formats(formats) - res.update({ - 'subtitles': subtitles, - 'formats': formats, - }) - - res.update(self._extract_episode_info(res.get('title'))) + if formats: + self._sort_formats(formats) + res.update({ + 'formats': formats, + }) return res From a11d059889574c4da0ff567fbafedc3943a863bd Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Tue, 20 Aug 2019 23:00:18 +0200 Subject: [PATCH 05/10] [ard] Make video format extraction more robust --- youtube_dl/extractor/ard.py | 293 ++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 112 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 7e040d1dd..bb35d5039 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -411,6 +411,8 @@ class ARDBetaMediathekIE(InfoExtractor): 'webm': 512, 'websm': 480, 'webs': 256, + # tagesschau24 uses a width of 256 instead of 320 for 
its + # smallest videos }, }, @@ -428,10 +430,11 @@ class ARDBetaMediathekIE(InfoExtractor): }, }, - # TODO Find out format data for videos from WDR and ONE. + # There is no format information in the URLs of videos from + # WDR and ONE. ] - def _get_format_from_url(self, format_url, quality): + def _build_format_from_http_url(self, format_url, suffix, width_from_json_pos): """Extract as much format data from the format_url as possible. Use the templates listed in _format_url_templates to do so. @@ -439,6 +442,7 @@ class ARDBetaMediathekIE(InfoExtractor): result = { 'url': format_url, + 'width': width_from_json_pos, 'preference': 10, # Plain HTTP, that's nice } @@ -448,7 +452,7 @@ class ARDBetaMediathekIE(InfoExtractor): m = re.match(template['pattern'], format_url) if m: groupdict = m.groupdict() - result['width'] = int_or_none(groupdict.get('width')) + result['width'] = int_or_none(groupdict.get('width', width_from_json_pos)) result['height'] = int_or_none(groupdict.get('height')) result['fps'] = int_or_none(groupdict.get('fps')) result['tbr'] = int_or_none(groupdict.get('tbr')) @@ -467,50 +471,24 @@ class ARDBetaMediathekIE(InfoExtractor): if result.get('height') and not result.get('width'): result['width'] = int((result['height'] / 9) * 16) - result['format_id'] = (('http-' + quality) if quality else 'http') + ('-' + format_id_suffix if format_id_suffix else '') + result['format_id'] = ((('http-' + suffix) if suffix else 'http') + + ('-' + format_id_suffix if format_id_suffix else '')) return result def _get_player_page(self, data): - if not data: + if not isinstance(data, dict): return None root = data.get('ROOT_QUERY') - if root: + if isinstance(root, dict): for val in root.values(): - if val.get('typename') == 'PlayerPage': + if isinstance(val, dict) and val.get('typename') == 'PlayerPage': return data.get(val.get('id')) return None - def _get_player_page_element(self, data, player_page, entry, key=None): - element = player_page.get(entry) - if element == 
None or key == None: - return element - - element_id = element.get('id') - if not element_id: - return None - - data_element = data.get(element_id) - if not data_element: - return None - - return data_element.get(key) - def _is_flag_set(self, data, flag): - player_page = self._get_player_page(data) - - if not player_page: - return False - - return self._get_player_page_element(data, player_page, flag) - - def _extract_age_limit(self, fsk_str): - m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) - if m and m.group(1): - return int_or_none(m.group(1)) - else: - return 0 + return self._get_elements_from_path(data, [flag]) def _extract_episode_info(self, title): res = {} @@ -543,100 +521,144 @@ class ARDBetaMediathekIE(InfoExtractor): break + # Fallback + if not res.get('episode'): + res['episode'] = title.strip() + return res + def _extract_age_limit(self, fsk_str): + m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) + if m and m.group(1): + return int_or_none(m.group(1)) + else: + return 0 + def _extract_metadata(self, data): res = {} - player_page = self._get_player_page(data) - - if player_page: - for template in [ - { 'dict_key': 'channel', - 'entry': 'publicationService', - 'key': 'name' }, + for template in [ + { 'dict_key': 'channel', + 'path': ['publicationService', 'name'] }, - { 'dict_key': 'series', - 'entry': 'show', - 'key': 'title' }, + { 'dict_key': 'series', + 'path': ['show', 'title'] }, - { 'dict_key': 'title', - 'entry': 'title' }, + { 'dict_key': 'title', + 'path': ['title'] }, - { 'dict_key': 'description', - 'entry': 'synopsis' }, + { 'dict_key': 'description', + 'path': ['synopsis'] }, - { 'dict_key': 'thumbnail', - 'entry': 'image', - 'key': 'src', - 'filter': lambda image_url: image_url.replace('{width}', '1920') }, + { 'dict_key': 'thumbnail', + 'path': ['image', 'src'], + 'filter': lambda image_url: image_url.replace('{width}', '1920') }, - { 'dict_key': 'timestamp', - 'entry': 'broadcastedOn', - 'filter': unified_timestamp }, + { 'dict_key': 
'timestamp', + 'path': ['broadcastedOn'], + 'filter': unified_timestamp }, - { 'dict_key': 'release_date', - 'entry': 'broadcastedOn', - 'filter': unified_strdate }, + { 'dict_key': 'release_date', + 'path': ['broadcastedOn'], + 'filter': unified_strdate }, - { 'dict_key': 'age_limit', - 'entry': 'maturityContentRating', - 'filter': self._extract_age_limit }, + { 'dict_key': 'age_limit', + 'path': ['maturityContentRating'], + 'filter': self._extract_age_limit }, - { 'dict_key': 'duration', - 'entry': 'mediaCollection', - 'key': '_duration', - 'filter': int_or_none }, + { 'dict_key': 'duration', + 'path': ['mediaCollection', '_duration'], + 'filter': int_or_none }, - { 'dict_key': 'subtitles', - 'entry': 'mediaCollection', - 'key': '_subtitleUrl', - 'filter': lambda subtitle_url: { 'de': [ { 'ext': 'ttml', 'url': subtitle_url } ]} }, - ]: - value = self._get_player_page_element(data, - player_page, - template.get('entry'), - template.get('key')) - if value != None: - filter_func = template.get('filter', str_or_none) - res[template['dict_key']] = filter_func(value) + { 'dict_key': 'subtitles', + 'path': ['mediaCollection', '_subtitleUrl'], + 'filter': lambda subtitle_url: { 'de': [ { 'ext': 'ttml', 'url': subtitle_url } ]} }, + ]: + value = self._get_elements_from_path(data, template.get('path')) + if value != None: + filter_func = template.get('filter', str_or_none) + res[template['dict_key']] = filter_func(value) - res.update(self._extract_episode_info(res.get('title'))) + res.update(self._extract_episode_info(res.get('title'))) return res - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') or video_id + def _resolve_element(self, data, element): + """Return the actual element if the given element links to another + element by id.""" + if element == None: + return None - webpage = self._download_webpage(url, display_id) - data_json = 
self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') - data = self._parse_json(data_json, display_id) + if isinstance(element, dict) and element.get('type') == 'id': + # This element refers to another element. + # Retrieve the actual element. + if not data: + return None + return data.get(element.get('id')) + + return element + + def _get_elements_from_path(self, data, path, parent=None): + if parent == None: + parent = self._get_player_page(data) + + if (not isinstance(parent, dict) or + not isinstance(path, list) or + len(path) == 0): + return None + + element = self._resolve_element(data, parent.get(path[0])) + res = element + if isinstance(element, list): + res = [] + for entry in element: + entry = self._resolve_element(data, entry) + if len(path[1:]) > 0: + res.append(self._get_elements_from_path(data, path[1:], entry)) + else: + res.append(entry) + elif len(path[1:]) > 0: + res = self._get_elements_from_path(data, path[1:], element) + + return res + + + def _extract_video_formats(self, video_id, data): + formats = [] if not data: - raise ExtractorError( - msg='Did not find any video data to extract', expected=True) + return formats - res = { - 'id': video_id, - 'display_id': display_id, - } - res.update(self._extract_metadata(data)) + qualities = self._get_elements_from_path(data, ['mediaCollection', + '_mediaArray', + '_mediaStreamArray', + '_quality']) + streams = self._get_elements_from_path(data, ['mediaCollection', + '_mediaArray', + '_mediaStreamArray', + '_stream', + 'json']) + if not streams: + return formats - # Extract video formats - formats = [] - for widget in data.values(): - if '_quality' in widget: - # Read format URLs from a MediaStreamArray - stream_array = try_get(widget, - lambda x: x['_stream']['json']) - if not stream_array: - continue + # The streams are ordered by their size in the JSON data. + # Use this to set the format's width. 
+ # The first index is the _mediaStreamArray index, the second one is + # the _stream.json index. + widths = [ + [], # At index 0 there's an m3u8 playlist ('quality' = 'auto') + [320], + [512, 480, 480], + [640, 960], + [1280], + [1920], + ] - for format_url in stream_array: - format_url = url_or_none(format_url) + for media_array_i, media_stream_arrays in enumerate(streams): + for media_stream_array_i, streams in enumerate(media_stream_arrays): + for stream_i, stream in enumerate(streams): + format_url = url_or_none(stream) if not format_url: continue @@ -658,8 +680,51 @@ class ARDBetaMediathekIE(InfoExtractor): format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: - quality = str_or_none(widget.get('_quality')) - formats.append(self._get_format_from_url(format_url, quality)) + # This is a video file for direct HTTP download + + if (qualities and + media_array_i < len(qualities) and + media_stream_array_i < len(qualities[media_array_i])): + quality = str_or_none(qualities[media_array_i][media_stream_array_i]) + else: + quality = None + + suffix = '-'.join(map(str, [media_array_i, media_stream_array_i, stream_i])) + if quality != None: + suffix = suffix + '-q' + quality + + # Infer the video's size from it's position within + # the JSON arrays. 
+ width = None + if media_stream_array_i < len(widths): + if stream_i < len(widths[media_stream_array_i]): + width = widths[media_stream_array_i][stream_i] + + formats.append(self._build_format_from_http_url(format_url, suffix, width)) + + return formats + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data = self._parse_json(data_json, display_id) + + if not data: + raise ExtractorError( + msg='Did not find any video data to extract', expected=True) + + res = { + 'id': video_id, + 'display_id': display_id, + } + + res.update(self._extract_metadata(data)) + + formats = self._extract_video_formats(video_id, data) if not formats and self._is_flag_set(data, 'geoblocked'): self.raise_geo_restricted( @@ -667,14 +732,18 @@ class ARDBetaMediathekIE(InfoExtractor): countries=['DE']) if not formats and self._is_flag_set(data, 'blockedByFsk'): - raise ExtractorError( - msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (res['age_limit'], 22 if res['age_limit'] < 18 else 23), - expected=True) + age_limit = res.get('age_limit') + if age_limit != None: + raise ExtractorError( + msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (age_limit, 22 if age_limit < 18 else 23), + expected=True) + else: + raise ExtractorError( + msg='This video is currently not available due to age restrictions. 
Try again later.', + expected=True) if formats: self._sort_formats(formats) - res.update({ - 'formats': formats, - }) + res['formats'] = formats return res From 2249a4b10a771cbd27b92b9579b8b46a849d24ae Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Fri, 23 Aug 2019 09:35:24 +0200 Subject: [PATCH 06/10] [ard] Add playlist extraction --- youtube_dl/extractor/ard.py | 497 +++++++++++++++++++++-------- youtube_dl/extractor/extractors.py | 3 +- 2 files changed, 361 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index bb35d5039..a3504f4b3 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -7,13 +7,12 @@ from .common import InfoExtractor from .generic import GenericIE from ..utils import ( determine_ext, - dict_get, ExtractorError, int_or_none, + orderedSet, parse_duration, qualities, str_or_none, - try_get, unified_strdate, unified_timestamp, update_url_query, @@ -21,7 +20,10 @@ from ..utils import ( url_basename, xpath_text, ) -from ..compat import compat_etree_fromstring +from ..compat import ( + compat_etree_fromstring, + compat_urllib_parse_urlencode, +) class ARDMediathekIE(InfoExtractor): @@ -304,7 +306,72 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(InfoExtractor): +class ARDMediathekBaseIE(InfoExtractor): + + def _get_page(self, data): + if not isinstance(data, dict): + return None + + root = data.get('ROOT_QUERY') + if isinstance(root, dict): + for val in root.values(): + if (isinstance(val, dict) and + val.get('typename') == self._page_type): + return data.get(val.get('id')) + else: + root = data.get('data') + if isinstance(root, dict): + for val in root.values(): + if (isinstance(val, dict) and + val.get('__typename') == self._page_type): + return val + return None + + def _is_flag_set(self, data, flag): + return self._get_elements_from_path(data, [flag]) + + def _resolve_element(self, data, element): + 
"""Return the element either directly or linked by ID.""" + if element is None: + return None + + if isinstance(element, dict) and element.get('type') == 'id': + # This element refers to another element. + # Retrieve the actual element. + if not data: + return None + return data.get(element.get('id')) + + return element + + def _get_elements_from_path(self, data, path, parent=None): + if parent is None: + parent = self._get_page(data) + + if (not isinstance(parent, dict) or + not isinstance(path, list) or + len(path) == 0): + return None + + element = self._resolve_element(data, parent.get(path[0])) + res = element + if isinstance(element, list): + res = [] + for entry in element: + entry = self._resolve_element(data, entry) + if len(path[1:]) > 0: + res.append(self._get_elements_from_path(data, + path[1:], + entry)) + else: + res.append(entry) + elif len(path[1:]) > 0: + res = self._get_elements_from_path(data, path[1:], element) + + return res + + +class ARDBetaMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' 
_TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', @@ -327,10 +394,12 @@ class ARDBetaMediathekIE(InfoExtractor): 'only_matching': True, }] + _page_type = "PlayerPage" + _format_url_templates = [ # Das Erste { - 'pattern': r'^.+/(?P\d+)-[^/]+_[^/]+\..{3,4}$', + 'pattern': r'^.+/(?P\d{1,4})-[^/]+_[^/]+\..{3,4}$', 'format_id_suffix': 'width', }, @@ -384,19 +453,19 @@ class ARDBetaMediathekIE(InfoExtractor): # HR { - 'pattern': r'^.+/[^/]+?(?P[0-9]+)x(?P[0-9]+)-(?P[0-9]+)[pi]-(?P[0-9]+)kbit\..{3,4}$', + 'pattern': r'^.+/[^/]+?(?P[0-9]{1,4})x(?P[0-9]{1,4})-(?P[0-9]{1,3})[pi]-(?P[0-9]{1,5})kbit\..{3,4}$', 'format_id_suffix': 'tbr', }, # Radio Bremen { - 'pattern': r'^.+/[^/]+_(?P\d+)p\..{3,4}$', + 'pattern': r'^.+/[^/]+_(?P\d{1,4})p\..{3,4}$', 'format_id_suffix': 'height', }, # RBB { - 'pattern': r'^.+/[^/]+_(?P\d+)k\..{3,4}$', + 'pattern': r'^.+/[^/]+_(?P\d{1,5})k\..{3,4}$', 'format_id_suffix': 'vbr', }, @@ -434,12 +503,11 @@ class ARDBetaMediathekIE(InfoExtractor): # WDR and ONE. ] - def _build_format_from_http_url(self, format_url, suffix, width_from_json_pos): + def _extract_format_from_url(self, format_url, suffix, width_from_json_pos): """Extract as much format data from the format_url as possible. Use the templates listed in _format_url_templates to do so. 
""" - result = { 'url': format_url, 'width': width_from_json_pos, @@ -452,7 +520,8 @@ class ARDBetaMediathekIE(InfoExtractor): m = re.match(template['pattern'], format_url) if m: groupdict = m.groupdict() - result['width'] = int_or_none(groupdict.get('width', width_from_json_pos)) + result['width'] = int_or_none(groupdict.get( + 'width', width_from_json_pos)) result['height'] = int_or_none(groupdict.get('height')) result['fps'] = int_or_none(groupdict.get('fps')) result['tbr'] = int_or_none(groupdict.get('tbr')) @@ -462,7 +531,8 @@ class ARDBetaMediathekIE(InfoExtractor): if width_dict: result['width'] = width_dict.get(groupdict.get('width_key')) - format_id_suffix = groupdict.get(template.get('format_id_suffix')) + format_id_suffix = groupdict.get( + template.get('format_id_suffix')) break if result.get('width') and not result.get('height'): @@ -471,24 +541,60 @@ class ARDBetaMediathekIE(InfoExtractor): if result.get('height') and not result.get('width'): result['width'] = int((result['height'] / 9) * 16) - result['format_id'] = ((('http-' + suffix) if suffix else 'http') + - ('-' + format_id_suffix if format_id_suffix else '')) + result['format_id'] = ((('http-' + suffix) + if suffix else 'http') + + ('-' + format_id_suffix + if format_id_suffix else '')) return result - def _get_player_page(self, data): - if not isinstance(data, dict): - return None + def _extract_format_from_index_pos(self, + data, + format_url, + media_array_i, + media_stream_array_i, + stream_i): + if not data: + return None - root = data.get('ROOT_QUERY') - if isinstance(root, dict): - for val in root.values(): - if isinstance(val, dict) and val.get('typename') == 'PlayerPage': - return data.get(val.get('id')) - return None + qualities = self._get_elements_from_path(data, ['mediaCollection', + '_mediaArray', + '_mediaStreamArray', + '_quality']) - def _is_flag_set(self, data, flag): - return self._get_elements_from_path(data, [flag]) + if (qualities and + media_array_i < len(qualities) and + 
media_stream_array_i < len( + qualities[media_array_i])): + quality = str_or_none( + qualities[media_array_i][media_stream_array_i]) + else: + quality = None + + suffix = '-'.join(map( + str, + [media_array_i, media_stream_array_i, stream_i])) + if quality is not None: + suffix = suffix + '-q' + quality + + # The streams are ordered by their size in the JSON data. + # Infer the video's size from its position within the JSON arrays. + # The first index is the _mediaStreamArray index, the second one is + # the _stream.json index. + widths = [ + [], # At index 0 there's an m3u8 playlist ('quality' = 'auto') + [320], + [512, 480, 480], + [640, 960], + [1280], + [1920], + ] + width = None + if media_stream_array_i < len(widths): + if stream_i < len(widths[media_stream_array_i]): + width = widths[media_stream_array_i][stream_i] + + return self._extract_format_from_url(format_url, suffix, width) def _extract_episode_info(self, title): res = {} @@ -514,7 +620,8 @@ class ARDBetaMediathekIE(InfoExtractor): # Build the episode title by removing numeric episode # information. 
if groupdict.get('ep_info') and not res['episode']: - res['episode'] = str_or_none(title.replace(groupdict.get('ep_info'), '')) + res['episode'] = str_or_none( + title.replace(groupdict.get('ep_info'), '')) if res['episode']: res['episode'] = res['episode'].strip() @@ -538,103 +645,59 @@ class ARDBetaMediathekIE(InfoExtractor): res = {} for template in [ - { 'dict_key': 'channel', - 'path': ['publicationService', 'name'] }, + {'key': 'channel', + 'path': ['publicationService', 'name']}, - { 'dict_key': 'series', - 'path': ['show', 'title'] }, + {'key': 'series', + 'path': ['show', 'title']}, - { 'dict_key': 'title', - 'path': ['title'] }, + {'key': 'title', + 'path': ['title']}, - { 'dict_key': 'description', - 'path': ['synopsis'] }, + {'key': 'description', + 'path': ['synopsis']}, - { 'dict_key': 'thumbnail', - 'path': ['image', 'src'], - 'filter': lambda image_url: image_url.replace('{width}', '1920') }, + {'key': 'thumbnail', + 'path': ['image', 'src'], + 'filter': lambda image_url: image_url.replace('{width}', '1920')}, - { 'dict_key': 'timestamp', - 'path': ['broadcastedOn'], - 'filter': unified_timestamp }, + {'key': 'timestamp', + 'path': ['broadcastedOn'], + 'filter': unified_timestamp}, - { 'dict_key': 'release_date', - 'path': ['broadcastedOn'], - 'filter': unified_strdate }, + {'key': 'release_date', + 'path': ['broadcastedOn'], + 'filter': unified_strdate}, - { 'dict_key': 'age_limit', - 'path': ['maturityContentRating'], - 'filter': self._extract_age_limit }, + {'key': 'age_limit', + 'path': ['maturityContentRating'], + 'filter': self._extract_age_limit}, - { 'dict_key': 'duration', - 'path': ['mediaCollection', '_duration'], - 'filter': int_or_none }, + {'key': 'duration', + 'path': ['mediaCollection', '_duration'], + 'filter': int_or_none}, - { 'dict_key': 'subtitles', - 'path': ['mediaCollection', '_subtitleUrl'], - 'filter': lambda subtitle_url: { 'de': [ { 'ext': 'ttml', 'url': subtitle_url } ]} }, + {'key': 'subtitles', + 'path': 
['mediaCollection', '_subtitleUrl'], + 'filter': lambda subtitle_url: {'de': [{'ext': 'ttml', + 'url': subtitle_url}]}}, ]: value = self._get_elements_from_path(data, template.get('path')) - if value != None: + if value is not None: filter_func = template.get('filter', str_or_none) - res[template['dict_key']] = filter_func(value) + res[template['key']] = filter_func(value) res.update(self._extract_episode_info(res.get('title'))) return res - def _resolve_element(self, data, element): - """Return the actual element if the given element links to another - element by id.""" - if element == None: - return None - - if isinstance(element, dict) and element.get('type') == 'id': - # This element refers to another element. - # Retrieve the actual element. - if not data: - return None - return data.get(element.get('id')) - - return element - - def _get_elements_from_path(self, data, path, parent=None): - if parent == None: - parent = self._get_player_page(data) - - if (not isinstance(parent, dict) or - not isinstance(path, list) or - len(path) == 0): - return None - - element = self._resolve_element(data, parent.get(path[0])) - res = element - if isinstance(element, list): - res = [] - for entry in element: - entry = self._resolve_element(data, entry) - if len(path[1:]) > 0: - res.append(self._get_elements_from_path(data, path[1:], entry)) - else: - res.append(entry) - elif len(path[1:]) > 0: - res = self._get_elements_from_path(data, path[1:], element) - - return res - - def _extract_video_formats(self, video_id, data): formats = [] if not data: return formats - - qualities = self._get_elements_from_path(data, ['mediaCollection', - '_mediaArray', - '_mediaStreamArray', - '_quality']) - streams = self._get_elements_from_path(data, ['mediaCollection', + streams = self._get_elements_from_path(data, ['mediaCollection', '_mediaArray', '_mediaStreamArray', '_stream', @@ -642,19 +705,6 @@ class ARDBetaMediathekIE(InfoExtractor): if not streams: return formats - # The streams are 
ordered by their size in the JSON data. - # Use this to set the format's width. - # The first index is the _mediaStreamArray index, the second one is - # the _stream.json index. - widths = [ - [], # At index 0 there's an m3u8 playlist ('quality' = 'auto') - [320], - [512, 480, 480], - [640, 960], - [1280], - [1920], - ] - for media_array_i, media_stream_arrays in enumerate(streams): for media_stream_array_i, streams in enumerate(media_stream_arrays): for stream_i, stream in enumerate(streams): @@ -666,7 +716,9 @@ class ARDBetaMediathekIE(InfoExtractor): # Occassionally, there are duplicate files from # different servers. duplicate = next((x for x in formats - if url_basename(x['url']) == url_basename(format_url)), None) + if url_basename(x['url']) == url_basename( + format_url)), + None) if duplicate: continue @@ -681,26 +733,9 @@ class ARDBetaMediathekIE(InfoExtractor): fatal=False)) else: # This is a video file for direct HTTP download - - if (qualities and - media_array_i < len(qualities) and - media_stream_array_i < len(qualities[media_array_i])): - quality = str_or_none(qualities[media_array_i][media_stream_array_i]) - else: - quality = None - - suffix = '-'.join(map(str, [media_array_i, media_stream_array_i, stream_i])) - if quality != None: - suffix = suffix + '-q' + quality - - # Infer the video's size from it's position within - # the JSON arrays. 
- width = None - if media_stream_array_i < len(widths): - if stream_i < len(widths[media_stream_array_i]): - width = widths[media_stream_array_i][stream_i] - - formats.append(self._build_format_from_http_url(format_url, suffix, width)) + formats.append(self._extract_format_from_index_pos( + data, format_url, + media_array_i, media_stream_array_i, stream_i)) return formats @@ -710,7 +745,10 @@ class ARDBetaMediathekIE(InfoExtractor): display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) - data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data_json = self._search_regex( + r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', + webpage, + 'json') data = self._parse_json(data_json, display_id) if not data: @@ -733,13 +771,17 @@ class ARDBetaMediathekIE(InfoExtractor): if not formats and self._is_flag_set(data, 'blockedByFsk'): age_limit = res.get('age_limit') - if age_limit != None: + if age_limit is not None: raise ExtractorError( - msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (age_limit, 22 if age_limit < 18 else 23), + msg='This video is currently not available due to age ' + 'restrictions (FSK %d). ' + 'Try again from %02d:00 to 06:00.' % ( + age_limit, 22 if age_limit < 18 else 23), expected=True) else: raise ExtractorError( - msg='This video is currently not available due to age restrictions. Try again later.', + msg='This video is currently not available due to age ' + 'restrictions. Try again later.', expected=True) if formats: @@ -747,3 +789,182 @@ class ARDBetaMediathekIE(InfoExtractor): res['formats'] = formats return res + + +class ARDBetaMediathekPlaylistIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?Pshows|more)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' 
+ _TESTS = [{ + 'url': 'https://www.ardmediathek.de/daserste/shows/Y3JpZDovL2Rhc2Vyc3RlLmRlL3N0dXJtIGRlciBsaWViZQ/sturm-der-liebe', + 'info_dict': { + 'id': '4e55c4bGxyuGq2gig0Q4WU', + 'display_id': 'menschen-und-leben', + 'title': 'Menschen & Leben', + } + }, { + 'url': 'https://www.ardmediathek.de/alpha/shows/Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy82YmM4YzFhMS1mYWQxLTRiMmYtOGRjYi0wZjk5YTk4YzU3ZTA/bob-ross-the-joy-of-painting', + 'info_dict': { + 'id': 'Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy82YmM4YzFhMS1mYWQxLTRiMmYtOGRjYi0wZjk5YTk4YzU3ZTA', + 'display_id': 'bob-ross-the-joy-of-painting', + 'title': 'Bob Ross - The Joy of Painting', + } + }, { + 'url': 'https://www.ardmediathek.de/ard/more/4e55c4bGxyuGq2gig0Q4WU/menschen-und-leben', + 'info_dict': { + 'id': '4e55c4bGxyuGq2gig0Q4WU', + 'display_id': 'menschen-und-leben', + 'title': 'Menschen & Leben', + } + }, + ] + + _configurations = { + 'shows': { + 'page_type': 'ShowPage', + 'playlist_id_name': 'showId', + 'persisted_query_hash': + '1801f782ce062a81d19465b059e6147671da882c510cca99e9a9ade8e542922e', + 'total_elements_path': ['pagination', 'totalElements'], + 'video_ids_path': ['teasers', 'links', 'target', 'id'], + }, + 'more': { + 'page_type': 'MorePage', + 'playlist_id_name': 'compilationId', + 'persisted_query_hash': + '0aa6f77b1d2400b94b9f92e6dbd0fabf652903ecf7c9e74d1367458d079f0810', + 'total_elements_path': ['widget', 'pagination', 'totalElements'], + 'video_ids_path': ['widget', 'teasers', 'links', 'target', 'id'], + }, + } + + def _build_query_str(self, client, playlist_id, page_number): + query_variables = '{{"client":"{}","{}":"{}","pageNumber":{}}}'.format( + client, + self._conf.get('playlist_id_name'), + playlist_id, + page_number) + + # The order of the parameters is important. It only works like this. 
+ return compat_urllib_parse_urlencode([ + ('variables', query_variables), + ('extensions', '{"persistedQuery":{"version":1,"sha256Hash":"' + + self._conf.get('persisted_query_hash') + '"}}'), ]) + + def _download_page(self, + video_id, referer, client, playlist_id, page_number): + api_url = 'https://api.ardmediathek.de/public-gateway' + + m = re.match(r'(?Phttps?://[^/]+)/[^/]*', referer) + origin = m.group('origin') + headers = {'Referer': referer, 'Origin': origin, + # The following headers are necessary to get a proper + # response. + 'Content-type': 'application/json', + 'Accept': '*/*', } + query_str = self._build_query_str(client, playlist_id, page_number) + + try: + note = 'Downloading video IDs (page {})'.format(page_number) + page_data = self._download_json(api_url + '?' + query_str, + video_id, + headers=headers, + note=note) + page = self._get_page(page_data) + except ExtractorError: + return None, None + + return page_data, page + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + display_id = mobj.group('display_id') or video_id + channel = mobj.group('channel') + playlist_type = mobj.group('playlist_type') + + self._conf = self._configurations.get(playlist_type) + self._page_type = self._conf.get('page_type') + + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex( + r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data = self._parse_json(data_json, display_id) + page = self._get_page(data) + if not isinstance(page, dict): + raise ExtractorError(msg='No playlist data available', + expected=True) + + title = self._get_elements_from_path(data, ['title'], page) + description = self._get_elements_from_path(data, ['synopsis'], page) + description = None + + page_number = 0 + + ep_data, page = self._download_page(display_id, url, channel, + video_id, page_number) + if not isinstance(page, dict): + raise ExtractorError(msg='No playlist data available', + 
expected=True) + + total_elements = self._get_elements_from_path( + ep_data, self._conf.get('total_elements_path'), page) or 0 + self.to_screen('{}: There are supposed to be {} videos.'.format( + display_id, total_elements)) + + page_size = 0 + num_skipped_ids = 0 + skipped_previous_page = False + + urls = [] + while True: + ids_on_page = self._get_elements_from_path( + ep_data, self._conf.get('video_ids_path'), page) + if ids_on_page: + urls.extend(['https://www.ardmediathek.de/{}/player/{}'.format( + channel, x) for x in ids_on_page]) + page_size = max(page_size, len(ids_on_page)) + elif not skipped_previous_page: + # We're receiving data but it doesn't contain any + # video IDs. This might happen if the number of reported + # elements is higher than the actual number of videos + # in this collection. + break + + if len(urls) + num_skipped_ids >= total_elements: + break + + page_number = page_number + 1 + ep_data, page = self._download_page(display_id, url, channel, + video_id, page_number) + skipped_previous_page = False + + if not isinstance(page, dict): + self.report_warning( + 'Could not download page {} with video IDs. ' + 'Skipping {} videos.'.format( + page_number, + min(page_size, + total_elements - len(urls) - num_skipped_ids)), + display_id) + num_skipped_ids = num_skipped_ids + page_size + skipped_previous_page = True + + # Remove duplicates + urls = orderedSet(urls) + + if total_elements > len(urls): + msg = 'Only received {} video IDs'.format(len(urls)) + if num_skipped_ids > 0: + # We had to skip pages because they could not be downloaded + msg = msg + '. 
Had to skip {} of {} videos'.format( + total_elements - len(urls), total_elements) + else: + # The API reported the wrong number of videos and/or there + # might have been duplicate entries + msg = msg + ' of {} reported videos.'.format(total_elements) + self.report_warning(msg) + + entries = [ + self.url_result(item_url, ie=ARDBetaMediathekIE.ie_key()) + for item_url in urls] + + return self.playlist_result(entries, video_id, title, description) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4adcae1e5..a7030d73b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -54,8 +54,9 @@ from .archiveorg import ArchiveOrgIE from .arkena import ArkenaIE from .ard import ( ARDBetaMediathekIE, + ARDBetaMediathekPlaylistIE, ARDIE, - ARDMediathekIE, +# ARDMediathekIE, ) from .arte import ( ArteTVPlus7IE, From 18ed43ee9763581d13676b14ba6b875b5fe39ac3 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Sat, 24 Aug 2019 21:37:25 +0200 Subject: [PATCH 07/10] [ard] Add series data to the old extractors --- youtube_dl/extractor/ard.py | 107 ++++++++++++++++------------- youtube_dl/extractor/extractors.py | 6 +- 2 files changed, 62 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index a3504f4b3..3048a8d94 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -26,9 +26,48 @@ from ..compat import ( ) -class ARDMediathekIE(InfoExtractor): - IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
+def _extract_episode_info(title): + """Try to extract episode data from the title.""" + res = {} + if not title: + return res + + for pattern in [ + r'.*(?P \(S(?P\d+)/E(?P\d+)\)).*', + r'.*(?P \((?:Folge |Teil )?(?P\d+)(?:/\d+)?\)).*', + r'.*(?PFolge (?P\d+)(?:\:| -|) )\"(?P.+)\".*', + r'.*(?PFolge (?P\d+)(?:/\d+)?(?:\:| -|) ).*', + ]: + m = re.match(pattern, title) + if m: + groupdict = m.groupdict() + for int_entry in ['season_number', 'episode_number']: + res[int_entry] = int_or_none(groupdict.get(int_entry)) + + for str_entry in ['episode']: + res[str_entry] = str_or_none(groupdict.get(str_entry)) + + # Build the episode title by removing numeric episode + # information. + if groupdict.get('ep_info') and not res['episode']: + res['episode'] = str_or_none( + title.replace(groupdict.get('ep_info'), '')) + + if res['episode']: + res['episode'] = res['episode'].strip() + + break + + # As a fallback use the whole title as the episode name + if not res.get('episode'): + res['episode'] = title.strip() + + return res + + +class ARDMediathekClassicIE(InfoExtractor): + IE_NAME = 'ARD:mediathek classic' + _VALID_URL = r'^https?://(?:(?:(?:classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ # available till 26.07.2022 @@ -65,7 +104,7 @@ class ARDMediathekIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + return False if ARDMediathekIE.suitable(url) else super(ARDMediathekClassicIE, cls).suitable(url) def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( @@ -235,17 +274,20 @@ class ARDMediathekIE(InfoExtractor): 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) + title = self._live_title(title) if info.get('is_live') else title info.update({ 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, }) + info.update(_extract_episode_info(title)) return info class ARDIE(InfoExtractor): + IE_NAME = 'Das Erste' _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 @@ -295,16 +337,23 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - return { + res = { 'id': mobj.group('id'), 'formats': formats, 'display_id': display_id, 'title': video_node.find('./title').text, + 'description': video_node.find('./desc').text, + 'channel': 'Das Erste', + 'series': video_node.find('./broadcast').text, 'duration': parse_duration(video_node.find('./duration').text), 'upload_date': upload_date, 'thumbnail': thumbnail, } + res.update(_extract_episode_info(res.get('title'))) + + return res + class ARDMediathekBaseIE(InfoExtractor): @@ -371,7 +420,7 @@ class ARDMediathekBaseIE(InfoExtractor): return res -class ARDBetaMediathekIE(ARDMediathekBaseIE): +class ARDMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' 
_TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', @@ -596,44 +645,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): return self._extract_format_from_url(format_url, suffix, width) - def _extract_episode_info(self, title): - res = {} - if not title: - return res - - # Try to read episode data from the title. - for pattern in [ - r'.*(?P \(S(?P\d+)/E(?P\d+)\)).*', - r'.*(?P \((?:Folge |Teil )?(?P\d+)(?:/\d+)?\)).*', - r'.*(?PFolge (?P\d+)(?:\:| -|) )\"(?P.+)\".*', - r'.*(?PFolge (?P\d+)(?:/\d+)?(?:\:| -|) ).*', - ]: - m = re.match(pattern, title) - if m: - groupdict = m.groupdict() - for int_entry in ['season_number', 'episode_number']: - res[int_entry] = int_or_none(groupdict.get(int_entry)) - - for str_entry in ['episode']: - res[str_entry] = str_or_none(groupdict.get(str_entry)) - - # Build the episode title by removing numeric episode - # information. - if groupdict.get('ep_info') and not res['episode']: - res['episode'] = str_or_none( - title.replace(groupdict.get('ep_info'), '')) - - if res['episode']: - res['episode'] = res['episode'].strip() - - break - - # Fallback - if not res.get('episode'): - res['episode'] = title.strip() - - return res - def _extract_age_limit(self, fsk_str): m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str) if m and m.group(1): @@ -687,7 +698,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): filter_func = template.get('filter', str_or_none) res[template['key']] = filter_func(value) - res.update(self._extract_episode_info(res.get('title'))) + res.update(_extract_episode_info(res.get('title'))) return res @@ -791,7 +802,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): return res -class ARDBetaMediathekPlaylistIE(ARDMediathekBaseIE): +class ARDMediathekPlaylistIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?Pshows|more)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' 
_TESTS = [{ 'url': 'https://www.ardmediathek.de/daserste/shows/Y3JpZDovL2Rhc2Vyc3RlLmRlL3N0dXJtIGRlciBsaWViZQ/sturm-der-liebe', @@ -964,7 +975,7 @@ class ARDBetaMediathekPlaylistIE(ARDMediathekBaseIE): self.report_warning(msg) entries = [ - self.url_result(item_url, ie=ARDBetaMediathekIE.ie_key()) + self.url_result(item_url, ie=ARDMediathekIE.ie_key()) for item_url in urls] return self.playlist_result(entries, video_id, title, description) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a7030d73b..996bf5ed3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -53,10 +53,10 @@ from .appletrailers import ( from .archiveorg import ArchiveOrgIE from .arkena import ArkenaIE from .ard import ( - ARDBetaMediathekIE, - ARDBetaMediathekPlaylistIE, + ARDMediathekIE, + ARDMediathekPlaylistIE, ARDIE, -# ARDMediathekIE, + ARDMediathekClassicIE, ) from .arte import ( ArteTVPlus7IE, From 767878189b9f985661d170dff6e188523b17a6a8 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Tue, 27 Aug 2019 20:09:05 +0200 Subject: [PATCH 08/10] [ard] Update tests --- youtube_dl/extractor/ard.py | 84 +++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 3048a8d94..60df2102e 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -71,7 +71,7 @@ class ARDMediathekClassicIE(InfoExtractor): _TESTS = [{ # available till 26.07.2022 - 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'url': 'http://classic.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', 'info_dict': { 'id': '44726822', 'ext': 'mp4', @@ -80,7 +80,6 @@ class 
ARDMediathekClassicIE(InfoExtractor): 'duration': 1740, }, 'params': { - # m3u8 download 'skip_download': True, } }, { @@ -88,7 +87,7 @@ class ARDMediathekClassicIE(InfoExtractor): 'only_matching': True, }, { # audio - 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'url': 'http://classic.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', 'only_matching': True, }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', @@ -290,17 +289,17 @@ class ARDIE(InfoExtractor): IE_NAME = 'Das Erste' _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 25.08.2020 + 'url': 'https://www.daserste.de/information/talk/presseclub/videos/gewitterwolken-am-konjunkturhimmel-ist-unser-wohlstand-in-gefahr-102.html', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', + 'display_id': 'gewitterwolken-am-konjunkturhimmel-ist-unser-wohlstand-in-gefahr', 'id': '102', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3501, + 'title': 'Gewitterwolken am Konjunkturhimmel: Ist unser Wohlstand in Gefahr?', + 'upload_date': '20190825', 'thumbnail': r're:^https?://.*\.jpg$', + 'description': r're:^Der immer aggressivere Handelskrieg zwischen China und den USA hinterlässt seine Spuren.*', }, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', @@ -423,20 
+422,37 @@ class ARDMediathekBaseIE(InfoExtractor): class ARDMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ - 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', - 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + # available till 26.07.2022 + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL2JyLmRlL3ZpZGVvLzUwY2YzZTVhLTk0NjYtNGFiMS04NjAzLTFjM2VkNWFjYjM0YQ/', 'info_dict': { - 'display_id': 'die-robuste-roswita', - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'title': 'Tatort: Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', - 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', - 'upload_date': '20180826', + 'id': 'Y3JpZDovL2JyLmRlL3ZpZGVvLzUwY2YzZTVhLTk0NjYtNGFiMS04NjAzLTFjM2VkNWFjYjM0YQ', + 'ext': 'mp4', + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'upload_date': '20170726', + 'timestamp': 1501101900, + 'duration': 1740, + }, + 'params': { + 'skip_download': True, + } + }, { + # available till 23.02.2020 + 'url': 'https://beta.ardmediathek.de/daserste/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC85MjQwZmJkZC0xMGNkLTQxOWUtYTE3Zi00NzJkYTE2ZTI0MmM/freigang', + 'info_dict': { + 'display_id': 'freigang', + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC85MjQwZmJkZC0xMGNkLTQxOWUtYTE3Zi00NzJkYTE2ZTI0MmM', + 'title': 'Freigang', + 'description': r're:^Das perfekte Alibi.*etwas faul.', + 'timestamp': 1566590400, + 'upload_date': '20190823', 'ext': 'mp4', }, + 'params': { + 'skip_download': True, + } }, { - 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'url': 
'https://beta.ardmediathek.de/daserste/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3BvbGl6ZWlydWYgMTEwL2M0NWMwZThlLTIwZDUtNDJiNC04MDY5LWQ5ODVhOWIyMzE2MQ/moerderische-dorfgemeinschaft', 'only_matching': True, }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', @@ -807,26 +823,32 @@ class ARDMediathekPlaylistIE(ARDMediathekBaseIE): _TESTS = [{ 'url': 'https://www.ardmediathek.de/daserste/shows/Y3JpZDovL2Rhc2Vyc3RlLmRlL3N0dXJtIGRlciBsaWViZQ/sturm-der-liebe', 'info_dict': { - 'id': '4e55c4bGxyuGq2gig0Q4WU', - 'display_id': 'menschen-und-leben', - 'title': 'Menschen & Leben', - } + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3N0dXJtIGRlciBsaWViZQ', + 'title': 'Sturm der Liebe', + }, + 'playlist_mincount': 1, }, { 'url': 'https://www.ardmediathek.de/alpha/shows/Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy82YmM4YzFhMS1mYWQxLTRiMmYtOGRjYi0wZjk5YTk4YzU3ZTA/bob-ross-the-joy-of-painting', 'info_dict': { 'id': 'Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy82YmM4YzFhMS1mYWQxLTRiMmYtOGRjYi0wZjk5YTk4YzU3ZTA', - 'display_id': 'bob-ross-the-joy-of-painting', 'title': 'Bob Ross - The Joy of Painting', - } + }, + 'playlist_mincount': 1, }, { 'url': 'https://www.ardmediathek.de/ard/more/4e55c4bGxyuGq2gig0Q4WU/menschen-und-leben', 'info_dict': { 'id': '4e55c4bGxyuGq2gig0Q4WU', - 'display_id': 'menschen-und-leben', 'title': 'Menschen & Leben', - } - }, - ] + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.ardmediathek.de/daserste/shows/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXU/tagesschau', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXU', + 'title': 'Tagesschau', + }, + 'only_matching': True, + }, ] _configurations = { 'shows': { @@ -972,7 +994,7 @@ class ARDMediathekPlaylistIE(ARDMediathekBaseIE): # The API reported the wrong number of videos and/or there # might have been duplicate entries msg = msg + ' of {} reported videos.'.format(total_elements) - self.report_warning(msg) + self.to_screen(msg) entries = [ self.url_result(item_url, ie=ARDMediathekIE.ie_key()) 
From 716514b92bcf7abfe6cd37804451997bc551ffd7 Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Sun, 1 Sep 2019 17:22:15 +0200 Subject: [PATCH 09/10] [ard] Fix two flake8 errors --- youtube_dl/extractor/ard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 60df2102e..2e26b00dc 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -744,7 +744,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): # different servers. duplicate = next((x for x in formats if url_basename(x['url']) == url_basename( - format_url)), + format_url)), None) if duplicate: continue @@ -893,7 +893,7 @@ class ARDMediathekPlaylistIE(ARDMediathekBaseIE): # response. 'Content-type': 'application/json', 'Accept': '*/*', } - query_str = self._build_query_str(client, playlist_id, page_number) + query_str = self._build_query_str(client, playlist_id, page_number) try: note = 'Downloading video IDs (page {})'.format(page_number) From 799d33c715aa9b947aa33189787349a389995e8b Mon Sep 17 00:00:00 2001 From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com> Date: Tue, 3 Sep 2019 10:54:00 +0200 Subject: [PATCH 10/10] [ard] Add extraction of is_live field --- youtube_dl/extractor/ard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2e26b00dc..46b729464 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -420,7 +420,7 @@ class ARDMediathekBaseIE(InfoExtractor): class ARDMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?Pplayer|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' 
_TESTS = [{ # available till 26.07.2022 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL2JyLmRlL3ZpZGVvLzUwY2YzZTVhLTk0NjYtNGFiMS04NjAzLTFjM2VkNWFjYjM0YQ/', @@ -785,6 +785,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): res = { 'id': video_id, 'display_id': display_id, + 'is_live': mobj.group('mode') == 'live', } res.update(self._extract_metadata(data))