[extractor/common] improve m3u8 info extraction

2016-03-10 19:58:44 +01:00 · 2016-03-10 19:58:44 +01:00 · 8f0bf01277
commit 8f0bf01277
parent f6f6217a98
1 changed file with 137 additions and 79 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1007,26 +1007,56 @@ class InfoExtractor(object):
        return formats
    def _extract_m3u8_info(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()
        return self._parse_m3u8(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
    def _parse_m3u8(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None):
        return {
            'formats': self._parse_m3u8_formats(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id),
            'subtitles': self._parse_m3u8_subtitles(m3u8_doc, m3u8_url),
        }
    def _parse_m3u8_attributes(self, attrib):
        info = {}
        for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
            if val.startswith('"'):
                val = val[1:-1]
            info[key] = val
        return info
    def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
        subtitle_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        subtitles = {}
        for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=SUBTITLES.*)', m3u8_doc):
            subtitle_info = self._parse_m3u8_attributes(attrib)
            subtitles.setdefault(subtitle_info['LANGUAGE'], []).append({
                'url': subtitle_url(subtitle_info['URI']),
                'ext': 'm3u8'
            })
        return subtitles
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):
        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
@ -1037,6 +1067,12 @@ class InfoExtractor(object):
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()
        return self._parse_m3u8_formats(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None):
        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
@ -1058,73 +1094,95 @@ class InfoExtractor(object):
                'protocol': entry_protocol,
                'preference': preference,
            }]
-        last_info = None
+
-        last_media = None
+        formats = [{
-        kv_rex = re.compile(
+            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
-            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
+            'url': m3u8_url,
-        for line in m3u8_doc.splitlines():
+            'ext': ext,
-            if line.startswith('#EXT-X-STREAM-INF:'):
+            'protocol': 'm3u8',
-                last_info = {}
+            'preference': preference - 1 if preference else -1,
-                for m in kv_rex.finditer(line):
+            'resolution': 'multiple',
-                    v = m.group('val')
+            'format_note': 'Quality selection URL',
-                    if v.startswith('"'):
+        }]
-                        v = v[1:-1]
+
-                    last_info[m.group('key')] = v
+        format_url = lambda u: (
-            elif line.startswith('#EXT-X-MEDIA:'):
+            u
-                last_media = {}
+            if re.match(r'^https?://', u)
-                for m in kv_rex.finditer(line):
+            else compat_urlparse.urljoin(m3u8_url, u))
-                    v = m.group('val')
+
-                    if v.startswith('"'):
+        def parse_media_info(media_info):
-                        v = v[1:-1]
+            return {
-                    last_media[m.group('key')] = v
+                'format_id': '%s-%s' % (media_info['GROUP-ID'], media_info['NAME']),
-            elif line.startswith('#') or not line.strip():
+                'url': format_url(url),
-                continue
+                'language': media_info.get('LANGUAGE'),
                'vcodec': 'none' if media_info['TYPE'] == 'AUDIO' else None
            }
        lang = None
        groups = {}
        for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=AUDIO.*)', m3u8_doc):
            media_info = self._parse_m3u8_attributes(attrib)
            url = media_info.get('URI')
            if url:
                formats.append(parse_media_info(media_info))
            else:
-                if last_info is None:
+                lang = media_info.get('LANGUAGE')
-                    formats.append({'url': format_url(line)})
+        for (attrib, url) in re.findall(r'#EXT-X-STREAM-INF:(?P<attrib>.*)\r?\n(?P<url>.+)', m3u8_doc):
-                    continue
+            stream_info = self._parse_m3u8_attributes(attrib)
-                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+            tbr = int_or_none(stream_info.get('AVERAGE-BANDWIDTH') or stream_info.get('BANDWIDTH'), scale=1000)
-                format_id = []
+            format_id = []
-                if m3u8_id:
+            if m3u8_id:
-                    format_id.append(m3u8_id)
+                format_id.append(m3u8_id)
-                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+            format_id.append('%d' % (tbr if tbr else len(formats)))
-                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+            f = {
-                f = {
+                'format_id': '-'.join(format_id),
-                    'format_id': '-'.join(format_id),
+                'url': format_url(url),
-                    'url': format_url(line.strip()),
+                'tbr': tbr,
-                    'tbr': tbr,
+                'fps': float_or_none(stream_info.get('FRAME-RATE')),
-                    'ext': ext,
+                'language': lang,
-                    'protocol': entry_protocol,
+                'ext': ext,
-                    'preference': preference,
+                'protocol': entry_protocol,
                'preference': preference,
            }
            resolution = stream_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                r = {
                    'width': int(width_str),
                    'height': int(height_str),
                }
-                resolution = last_info.get('RESOLUTION')
+                f.update(r)
-                if resolution:
+                group = stream_info.get('VIDEO')
-                    width_str, height_str = resolution.split('x')
+                if group:
-                    f['width'] = int(width_str)
+                    groups[group] = r
-                    f['height'] = int(height_str)
+            codecs = stream_info.get('CODECS')
-                codecs = last_info.get('CODECS')
+            if codecs:
-                if codecs:
+                vcodec, acodec = [None] * 2
-                    vcodec, acodec = [None] * 2
+                va_codecs = codecs.split(',')
-                    va_codecs = codecs.split(',')
+                if len(va_codecs) == 1:
-                    if len(va_codecs) == 1:
+                    # Audio only entries usually come with single codec and
-                        # Audio only entries usually come with single codec and
+                    # no resolution. For more robustness we also check it to
-                        # no resolution. For more robustness we also check it to
+                    # be mp4 audio.
-                        # be mp4 audio.
+                    if not resolution and va_codecs[0].startswith('mp4a'):
-                        if not resolution and va_codecs[0].startswith('mp4a'):
+                        vcodec, acodec = 'none', va_codecs[0]
                            vcodec, acodec = 'none', va_codecs[0]
                        else:
                            vcodec = va_codecs[0]
                    else:
-                        vcodec, acodec = va_codecs[:2]
+                        vcodec = va_codecs[0]
-                    f.update({
+                else:
-                        'acodec': acodec,
+                    vcodec, acodec = va_codecs[:2]
-                        'vcodec': vcodec,
+                f.update({
-                    })
+                    'acodec': acodec,
-                if last_media is not None:
+                    'vcodec': vcodec,
-                    f['m3u8_media'] = last_media
+                })
-                    last_media = None
+            formats.append(f)
-                formats.append(f)
+        for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=VIDEO.*)', m3u8_doc):
-                last_info = {}
+            media_info = self._parse_m3u8_attributes(attrib)
            url = media_info.get('URI')
            if url:
                v = parse_media_info(media_info)
                group = groups.get(media_info['GROUP-ID'])
                if group:
                    v.update(group)
                formats.append(v)
        self._sort_formats(formats)
        return formats