[ard] Make video format extraction more robust

This commit is contained in:
TinyToweringTree 2019-08-20 23:00:18 +02:00
parent 614c62ec8f
commit a11d059889

View File

@ -411,6 +411,8 @@ class ARDBetaMediathekIE(InfoExtractor):
'webm': 512, 'webm': 512,
'websm': 480, 'websm': 480,
'webs': 256, 'webs': 256,
# tagesschau24 uses a width of 256 instead of 320 for its
# smallest videos
}, },
}, },
@ -428,10 +430,11 @@ class ARDBetaMediathekIE(InfoExtractor):
}, },
}, },
# TODO Find out format data for videos from WDR and ONE. # There is no format information in the URLs of videos from
# WDR and ONE.
] ]
def _get_format_from_url(self, format_url, quality): def _build_format_from_http_url(self, format_url, suffix, width_from_json_pos):
"""Extract as much format data from the format_url as possible. """Extract as much format data from the format_url as possible.
Use the templates listed in _format_url_templates to do so. Use the templates listed in _format_url_templates to do so.
@ -439,6 +442,7 @@ class ARDBetaMediathekIE(InfoExtractor):
result = { result = {
'url': format_url, 'url': format_url,
'width': width_from_json_pos,
'preference': 10, # Plain HTTP, that's nice 'preference': 10, # Plain HTTP, that's nice
} }
@ -448,7 +452,7 @@ class ARDBetaMediathekIE(InfoExtractor):
m = re.match(template['pattern'], format_url) m = re.match(template['pattern'], format_url)
if m: if m:
groupdict = m.groupdict() groupdict = m.groupdict()
result['width'] = int_or_none(groupdict.get('width')) result['width'] = int_or_none(groupdict.get('width', width_from_json_pos))
result['height'] = int_or_none(groupdict.get('height')) result['height'] = int_or_none(groupdict.get('height'))
result['fps'] = int_or_none(groupdict.get('fps')) result['fps'] = int_or_none(groupdict.get('fps'))
result['tbr'] = int_or_none(groupdict.get('tbr')) result['tbr'] = int_or_none(groupdict.get('tbr'))
@ -467,50 +471,24 @@ class ARDBetaMediathekIE(InfoExtractor):
if result.get('height') and not result.get('width'): if result.get('height') and not result.get('width'):
result['width'] = int((result['height'] / 9) * 16) result['width'] = int((result['height'] / 9) * 16)
result['format_id'] = (('http-' + quality) if quality else 'http') + ('-' + format_id_suffix if format_id_suffix else '') result['format_id'] = ((('http-' + suffix) if suffix else 'http') +
('-' + format_id_suffix if format_id_suffix else ''))
return result return result
def _get_player_page(self, data): def _get_player_page(self, data):
if not data: if not isinstance(data, dict):
return None return None
root = data.get('ROOT_QUERY') root = data.get('ROOT_QUERY')
if root: if isinstance(root, dict):
for val in root.values(): for val in root.values():
if val.get('typename') == 'PlayerPage': if isinstance(val, dict) and val.get('typename') == 'PlayerPage':
return data.get(val.get('id')) return data.get(val.get('id'))
return None return None
def _get_player_page_element(self, data, player_page, entry, key=None):
element = player_page.get(entry)
if element == None or key == None:
return element
element_id = element.get('id')
if not element_id:
return None
data_element = data.get(element_id)
if not data_element:
return None
return data_element.get(key)
def _is_flag_set(self, data, flag):
    """Return the player-page value stored under *flag* (truthy when set)."""
    flag_path = [flag]
    return self._get_elements_from_path(data, flag_path)
def _extract_age_limit(self, fsk_str):
m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str)
if m and m.group(1):
return int_or_none(m.group(1))
else:
return 0
def _extract_episode_info(self, title): def _extract_episode_info(self, title):
res = {} res = {}
@ -543,100 +521,144 @@ class ARDBetaMediathekIE(InfoExtractor):
break break
# Fallback
if not res.get('episode'):
res['episode'] = title.strip()
return res return res
def _extract_age_limit(self, fsk_str):
m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str)
if m and m.group(1):
return int_or_none(m.group(1))
else:
return 0
def _extract_metadata(self, data):
    """Collect video metadata from the Apollo state *data*.

    Each template maps a result key ('dict_key') to a path of keys
    inside the player page; an optional 'filter' callable
    post-processes the raw value (str_or_none is applied when no
    filter is given).  Values that resolve to None are skipped.
    """
    res = {}
    for template in [
        {'dict_key': 'channel',
         'path': ['publicationService', 'name']},
        {'dict_key': 'series',
         'path': ['show', 'title']},
        {'dict_key': 'title',
         'path': ['title']},
        {'dict_key': 'description',
         'path': ['synopsis']},
        {'dict_key': 'thumbnail',
         'path': ['image', 'src'],
         # The image URL carries a '{width}' placeholder; request the
         # largest common size.
         'filter': lambda image_url: image_url.replace('{width}', '1920')},
        {'dict_key': 'timestamp',
         'path': ['broadcastedOn'],
         'filter': unified_timestamp},
        {'dict_key': 'release_date',
         'path': ['broadcastedOn'],
         'filter': unified_strdate},
        {'dict_key': 'age_limit',
         'path': ['maturityContentRating'],
         'filter': self._extract_age_limit},
        {'dict_key': 'duration',
         'path': ['mediaCollection', '_duration'],
         'filter': int_or_none},
        {'dict_key': 'subtitles',
         'path': ['mediaCollection', '_subtitleUrl'],
         'filter': lambda subtitle_url: {'de': [{'ext': 'ttml', 'url': subtitle_url}]}},
    ]:
        value = self._get_elements_from_path(data, template.get('path'))
        if value is not None:
            filter_func = template.get('filter', str_or_none)
            res[template['dict_key']] = filter_func(value)
    # Derive episode/season information from the extracted title.
    res.update(self._extract_episode_info(res.get('title')))
    return res
def _real_extract(self, url): def _resolve_element(self, data, element):
mobj = re.match(self._VALID_URL, url) """Return the actual element if the given element links to another
video_id = mobj.group('video_id') element by id."""
display_id = mobj.group('display_id') or video_id if element == None:
return None
webpage = self._download_webpage(url, display_id) if isinstance(element, dict) and element.get('type') == 'id':
data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') # This element refers to another element.
data = self._parse_json(data_json, display_id) # Retrieve the actual element.
if not data:
return None
return data.get(element.get('id'))
return element
def _get_elements_from_path(self, data, path, parent=None):
    """Walk *path* (a list of keys) through the normalized Apollo state.

    Traversal starts at *parent*, defaulting to the player page.  Keys
    whose values are id references are resolved against *data*; list
    values are mapped element-wise, so the result mirrors the JSON
    nesting.  Returns None when the path cannot be followed.
    """
    if parent is None:
        parent = self._get_player_page(data)
    if not isinstance(parent, dict) or not isinstance(path, list) or not path:
        return None
    element = self._resolve_element(data, parent.get(path[0]))
    # Remaining keys to follow; hoisted so the slice is built once
    # instead of on every len(path[1:]) test.
    rest = path[1:]
    if isinstance(element, list):
        # Resolve each list entry and recurse when more keys remain.
        res = []
        for entry in element:
            entry = self._resolve_element(data, entry)
            if rest:
                res.append(self._get_elements_from_path(data, rest, entry))
            else:
                res.append(entry)
        return res
    if rest:
        return self._get_elements_from_path(data, rest, element)
    return element
def _extract_video_formats(self, video_id, data):
formats = []
if not data: if not data:
raise ExtractorError( return formats
msg='Did not find any video data to extract', expected=True)
res = {
'id': video_id,
'display_id': display_id,
}
res.update(self._extract_metadata(data)) qualities = self._get_elements_from_path(data, ['mediaCollection',
'_mediaArray',
'_mediaStreamArray',
'_quality'])
streams = self._get_elements_from_path(data, ['mediaCollection',
'_mediaArray',
'_mediaStreamArray',
'_stream',
'json'])
if not streams:
return formats
# Extract video formats # The streams are ordered by their size in the JSON data.
formats = [] # Use this to set the format's width.
for widget in data.values(): # The first index is the _mediaStreamArray index, the second one is
if '_quality' in widget: # the _stream.json index.
# Read format URLs from a MediaStreamArray widths = [
stream_array = try_get(widget, [], # At index 0 there's an m3u8 playlist ('quality' = 'auto')
lambda x: x['_stream']['json']) [320],
if not stream_array: [512, 480, 480],
continue [640, 960],
[1280],
[1920],
]
for format_url in stream_array: for media_array_i, media_stream_arrays in enumerate(streams):
format_url = url_or_none(format_url) for media_stream_array_i, streams in enumerate(media_stream_arrays):
for stream_i, stream in enumerate(streams):
format_url = url_or_none(stream)
if not format_url: if not format_url:
continue continue
@ -658,8 +680,51 @@ class ARDBetaMediathekIE(InfoExtractor):
format_url, video_id, 'mp4', m3u8_id='hls', format_url, video_id, 'mp4', m3u8_id='hls',
fatal=False)) fatal=False))
else: else:
quality = str_or_none(widget.get('_quality')) # This is a video file for direct HTTP download
formats.append(self._get_format_from_url(format_url, quality))
if (qualities and
media_array_i < len(qualities) and
media_stream_array_i < len(qualities[media_array_i])):
quality = str_or_none(qualities[media_array_i][media_stream_array_i])
else:
quality = None
suffix = '-'.join(map(str, [media_array_i, media_stream_array_i, stream_i]))
if quality != None:
suffix = suffix + '-q' + quality
# Infer the video's size from its position within
# the JSON arrays.
width = None
if media_stream_array_i < len(widths):
if stream_i < len(widths[media_stream_array_i]):
width = widths[media_stream_array_i][stream_i]
formats.append(self._build_format_from_http_url(format_url, suffix, width))
return formats
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
display_id = mobj.group('display_id') or video_id
webpage = self._download_webpage(url, display_id)
data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json')
data = self._parse_json(data_json, display_id)
if not data:
raise ExtractorError(
msg='Did not find any video data to extract', expected=True)
res = {
'id': video_id,
'display_id': display_id,
}
res.update(self._extract_metadata(data))
formats = self._extract_video_formats(video_id, data)
if not formats and self._is_flag_set(data, 'geoblocked'): if not formats and self._is_flag_set(data, 'geoblocked'):
self.raise_geo_restricted( self.raise_geo_restricted(
@ -667,14 +732,18 @@ class ARDBetaMediathekIE(InfoExtractor):
countries=['DE']) countries=['DE'])
if not formats and self._is_flag_set(data, 'blockedByFsk'): if not formats and self._is_flag_set(data, 'blockedByFsk'):
raise ExtractorError( age_limit = res.get('age_limit')
msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (res['age_limit'], 22 if res['age_limit'] < 18 else 23), if age_limit != None:
expected=True) raise ExtractorError(
msg='This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (age_limit, 22 if age_limit < 18 else 23),
expected=True)
else:
raise ExtractorError(
msg='This video is currently not available due to age restrictions. Try again later.',
expected=True)
if formats: if formats:
self._sort_formats(formats) self._sort_formats(formats)
res.update({ res['formats'] = formats
'formats': formats,
})
return res return res