[ard] Improve resolution extraction for HTTP files

2019-08-05 12:48:39 +02:00 · 2019-08-05 12:48:39 +02:00 · 51bd522667
commit 51bd522667
parent f620d0d860
1 changed files with 180 additions and 22 deletions
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@ -7,6 +7,7 @@ from .common import InfoExtractor
 from .generic import GenericIE
 from ..utils import (
    determine_ext,
    dict_get,
    ExtractorError,
    int_or_none,
    parse_duration,
@ -17,6 +18,7 @@ from ..utils import (
    unified_timestamp,
    update_url_query,
    url_or_none,
    url_basename,
    xpath_text,
 )
 from ..compat import compat_etree_fromstring
@ -325,6 +327,151 @@ class ARDBetaMediathekIE(InfoExtractor):
        'only_matching': True,
    }]
    # Templates used by _get_format_from_url() to read format metadata out
    # of a plain HTTP media URL. They are tried in list order and the first
    # template whose 'pattern' matches is used. Keys of each template:
    #   'pattern'          - regex applied with re.match() to the format
    #                        URL; the named groups 'width', 'height', 'fps',
    #                        'tbr' and 'vbr' (where present) are read as
    #                        integers.
    #   'width_dict'       - optional; maps the text captured by the
    #                        'width_key' group to the format's width.
    #   'format_id_suffix' - name of the group whose captured text is
    #                        appended to the format_id.
    _format_url_templates = [
        # Das Erste
        {
            'pattern': r'^.+/(?P<width>\d+)-[^/]+_[^/]+\..{3,4}$',
            'format_id_suffix': 'width',
        },
        # SWR / SR / NDR
        {
            'pattern': r'^.+/[^/]+\.(?P<width_key>[a-z]+)\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                # SWR / SR
                'xxl': 1920,
                'xl': 1280,
                'l': 960,
                'ml': 640,
                'm': 512,
                'sm': 480,
                's': 320,
                # NDR
                'hd': 1280,
                'hq': 960,
                'ln': 640,
                'hi': 512,
                'mn': 480,
                'lo': 320,
            },
        },
        # BR / ARD-alpha / SR
        {
            'pattern': r'^.+/[^/]+_(?P<width_key>[A-Z0-9])\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                # BR, ARD-alpha
                'X': 1280,
                'C': 960,
                'E': 640,
                'B': 512,
                '2': 480,
                'A': 480,
                '0': 320,
                # SR
                'P': 1280,
                'L': 960,
                'N': 640,
                'M': 512,
                'K': 480,
                'S': 320,
            },
        },
        # HR
        {
            'pattern': r'^.+/[^/]+?(?P<width>[0-9]+)x(?P<height>[0-9]+)-(?P<fps>[0-9]+)[pi]-(?P<tbr>[0-9]+)kbit\..{3,4}$',
            'format_id_suffix': 'tbr',
        },
        # Radio Bremen
        {
            'pattern': r'^.+/[^/]+_(?P<height>\d+)p\..{3,4}$',
            'format_id_suffix': 'height',
        },
        # RBB
        {
            'pattern': r'^.+/[^/]+_(?P<vbr>\d+)k\..{3,4}$',
            'format_id_suffix': 'vbr',
        },
        # tagesschau24
        {
            'pattern': r'^.+/[^/]+\.(?P<width_key>[a-z]+)\.[^/]+\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                'webxl': 1280,
                'webl': 960,
                'webml': 640,
                'webm': 512,
                'websm': 480,
                'webs': 256,
            },
        },
        # MDR
        {
            'pattern': r'^.+/[^/]+-(?P<width_key>[a-z0-9]+)_[^/]+\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                'be7c2950aac6': 1280,
                '730aae549c28': 960,
                '41dd60577440': 640,
                '9a4bb04739be': 512,
                '39c393010ca9': 480,
                'd1ceaa57a495': 320,
            },
        },
        # TODO Find out format data for videos from WDR and ONE.
    ]
    def _get_format_from_url(self, format_url, quality):
        """Build a format dict for a plain HTTP media file.

        Extract as much format data from the format_url as possible,
        using the templates listed in _format_url_templates. Templates
        are tried in order; the first one whose pattern matches is used.

        format_url -- URL of the media file
        quality    -- quality label put into the format_id (may be falsy)

        Returns a dict that always holds 'url', 'preference' and
        'format_id'. If a template matched it also holds 'width',
        'height', 'fps', 'tbr' and 'vbr' (None where unknown); a missing
        width or height is derived from the other one assuming 16:9.
        """
        result = {
            'url': format_url,
            'preference': 10,  # Plain HTTP, that's nice
        }

        format_id_suffix = None
        for template in self._format_url_templates:
            m = re.match(template['pattern'], format_url)
            if not m:
                continue
            groupdict = m.groupdict()
            for key in ('width', 'height', 'fps', 'tbr', 'vbr'):
                result[key] = int_or_none(groupdict.get(key))
            # A template with a width_dict encodes the width as a symbolic
            # key in the file name instead of a number.
            width_dict = template.get('width_dict')
            if width_dict:
                result['width'] = width_dict.get(groupdict.get('width_key'))
            format_id_suffix = groupdict.get(template.get('format_id_suffix'))
            break

        # Fill in the missing dimension assuming a 16:9 picture. Multiply
        # before the floor division so that the result is exact and the
        # same on Python 2 and Python 3 (a plain "/" truncates early on
        # Python 2, e.g. 480 / 9 * 16 == 848 instead of 853).
        width = result.get('width')
        height = result.get('height')
        if width and not height:
            result['height'] = width * 9 // 16
        elif height and not width:
            result['width'] = height * 16 // 9

        format_id = ('http-' + quality) if quality else 'http'
        if format_id_suffix:
            format_id += '-' + format_id_suffix
        result['format_id'] = format_id

        return result
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')
@ -333,6 +480,8 @@ class ARDBetaMediathekIE(InfoExtractor):
        webpage = self._download_webpage(url, display_id)
        data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json')
        data = self._parse_json(data_json, display_id)
        #import json
        #print(json.dumps(data, indent=2))
        res = {
            'id': video_id,
@ -361,36 +510,45 @@ class ARDBetaMediathekIE(InfoExtractor):
                    'url': subtitle_url,
                })
            if '_quality' in widget:
-                format_url = url_or_none(try_get(
+                # Read format URLs from a MediaStreamArray
-                    widget, lambda x: x['_stream']['json'][0]))
+                stream_array = try_get(widget,
-                if not format_url:
+                                       lambda x: x['_stream']['json'])
                if not stream_array:
                    continue
-                ext = determine_ext(format_url)
+
-                if ext == 'f4m':
+                for format_url in stream_array:
-                    formats.extend(self._extract_f4m_formats(
+                    format_url = url_or_none(format_url)
-                        format_url + '?hdcore=3.11.0',
+                    if not format_url:
                        video_id, f4m_id='hds', fatal=False))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', m3u8_id='hls',
                        fatal=False))
                else:
                    # HTTP formats are not available when geoblocked is True,
                    # other formats are fine though
                    if geoblocked:
                        continue
-                    quality = str_or_none(widget.get('_quality'))
+
-                    formats.append({
+                    # Make sure this format isn't already in our list.
-                        'format_id': ('http-' + quality) if quality else 'http',
+                    # Occasionally, there are duplicate files from
-                        'url': format_url,
+                    # different servers.
-                        'preference': 10,  # Plain HTTP, that's nice
+                    duplicate = next((x for x in formats
-                    })
+                        if url_basename(x['url']) == url_basename(format_url)), None)
                    if duplicate:
                        continue
                    ext = determine_ext(format_url)
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            format_url + '?hdcore=3.11.0',
                            video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            format_url, video_id, 'mp4', m3u8_id='hls',
                            fatal=False))
                    else:
                        quality = str_or_none(widget.get('_quality'))
                        formats.append(self._get_format_from_url(format_url, quality))
        if not formats and geoblocked:
            self.raise_geo_restricted(
                msg='This video is not available due to geoblocking',
                countries=['DE'])
        # TODO Improve error handling when video is only unavailable at
        #      certain times due to age restrictions.
        self._sort_formats(formats)
        res.update({
            'subtitles': subtitles,