[extractor/common] improve m3u8 info extraction
This commit is contained in:
parent
f6f6217a98
commit
8f0bf01277
@ -1007,26 +1007,56 @@ class InfoExtractor(object):
|
||||
|
||||
return formats
|
||||
|
||||
def _extract_m3u8_info(self, m3u8_url, video_id, ext=None,
|
||||
entry_protocol='m3u8', preference=None,
|
||||
m3u8_id=None, note=None, errnote=None,
|
||||
fatal=True):
|
||||
res = self._download_webpage_handle(
|
||||
m3u8_url, video_id,
|
||||
note=note or 'Downloading m3u8 information',
|
||||
errnote=errnote or 'Failed to download m3u8 information',
|
||||
fatal=fatal)
|
||||
if res is False:
|
||||
return []
|
||||
m3u8_doc, urlh = res
|
||||
m3u8_url = urlh.geturl()
|
||||
|
||||
return self._parse_m3u8(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
|
||||
|
||||
def _parse_m3u8(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None):
|
||||
return {
|
||||
'formats': self._parse_m3u8_formats(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id),
|
||||
'subtitles': self._parse_m3u8_subtitles(m3u8_doc, m3u8_url),
|
||||
}
|
||||
|
||||
def _parse_m3u8_attributes(self, attrib):
|
||||
info = {}
|
||||
for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
|
||||
if val.startswith('"'):
|
||||
val = val[1:-1]
|
||||
info[key] = val
|
||||
return info
|
||||
|
||||
def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
|
||||
subtitle_url = lambda u: (
|
||||
u
|
||||
if re.match(r'^https?://', u)
|
||||
else compat_urlparse.urljoin(m3u8_url, u))
|
||||
|
||||
subtitles = {}
|
||||
for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=SUBTITLES.*)', m3u8_doc):
|
||||
subtitle_info = self._parse_m3u8_attributes(attrib)
|
||||
subtitles.setdefault(subtitle_info['LANGUAGE'], []).append({
|
||||
'url': subtitle_url(subtitle_info['URI']),
|
||||
'ext': 'm3u8'
|
||||
})
|
||||
return subtitles
|
||||
|
||||
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
|
||||
entry_protocol='m3u8', preference=None,
|
||||
m3u8_id=None, note=None, errnote=None,
|
||||
fatal=True):
|
||||
|
||||
formats = [{
|
||||
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
|
||||
'url': m3u8_url,
|
||||
'ext': ext,
|
||||
'protocol': 'm3u8',
|
||||
'preference': preference - 1 if preference else -1,
|
||||
'resolution': 'multiple',
|
||||
'format_note': 'Quality selection URL',
|
||||
}]
|
||||
|
||||
format_url = lambda u: (
|
||||
u
|
||||
if re.match(r'^https?://', u)
|
||||
else compat_urlparse.urljoin(m3u8_url, u))
|
||||
|
||||
res = self._download_webpage_handle(
|
||||
m3u8_url, video_id,
|
||||
note=note or 'Downloading m3u8 information',
|
||||
@ -1037,6 +1067,12 @@ class InfoExtractor(object):
|
||||
m3u8_doc, urlh = res
|
||||
m3u8_url = urlh.geturl()
|
||||
|
||||
return self._parse_m3u8_formats(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
|
||||
|
||||
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
|
||||
entry_protocol='m3u8', preference=None,
|
||||
m3u8_id=None):
|
||||
|
||||
# We should try extracting formats only from master playlists [1], i.e.
|
||||
# playlists that describe available qualities. On the other hand media
|
||||
# playlists [2] should be returned as is since they contain just the media
|
||||
@ -1058,51 +1094,68 @@ class InfoExtractor(object):
|
||||
'protocol': entry_protocol,
|
||||
'preference': preference,
|
||||
}]
|
||||
last_info = None
|
||||
last_media = None
|
||||
kv_rex = re.compile(
|
||||
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
|
||||
for line in m3u8_doc.splitlines():
|
||||
if line.startswith('#EXT-X-STREAM-INF:'):
|
||||
last_info = {}
|
||||
for m in kv_rex.finditer(line):
|
||||
v = m.group('val')
|
||||
if v.startswith('"'):
|
||||
v = v[1:-1]
|
||||
last_info[m.group('key')] = v
|
||||
elif line.startswith('#EXT-X-MEDIA:'):
|
||||
last_media = {}
|
||||
for m in kv_rex.finditer(line):
|
||||
v = m.group('val')
|
||||
if v.startswith('"'):
|
||||
v = v[1:-1]
|
||||
last_media[m.group('key')] = v
|
||||
elif line.startswith('#') or not line.strip():
|
||||
continue
|
||||
|
||||
formats = [{
|
||||
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
|
||||
'url': m3u8_url,
|
||||
'ext': ext,
|
||||
'protocol': 'm3u8',
|
||||
'preference': preference - 1 if preference else -1,
|
||||
'resolution': 'multiple',
|
||||
'format_note': 'Quality selection URL',
|
||||
}]
|
||||
|
||||
format_url = lambda u: (
|
||||
u
|
||||
if re.match(r'^https?://', u)
|
||||
else compat_urlparse.urljoin(m3u8_url, u))
|
||||
|
||||
def parse_media_info(media_info):
|
||||
return {
|
||||
'format_id': '%s-%s' % (media_info['GROUP-ID'], media_info['NAME']),
|
||||
'url': format_url(url),
|
||||
'language': media_info.get('LANGUAGE'),
|
||||
'vcodec': 'none' if media_info['TYPE'] == 'AUDIO' else None
|
||||
}
|
||||
|
||||
lang = None
|
||||
groups = {}
|
||||
for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=AUDIO.*)', m3u8_doc):
|
||||
media_info = self._parse_m3u8_attributes(attrib)
|
||||
url = media_info.get('URI')
|
||||
if url:
|
||||
formats.append(parse_media_info(media_info))
|
||||
else:
|
||||
if last_info is None:
|
||||
formats.append({'url': format_url(line)})
|
||||
continue
|
||||
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
|
||||
lang = media_info.get('LANGUAGE')
|
||||
for (attrib, url) in re.findall(r'#EXT-X-STREAM-INF:(?P<attrib>.*)\r?\n(?P<url>.+)', m3u8_doc):
|
||||
stream_info = self._parse_m3u8_attributes(attrib)
|
||||
tbr = int_or_none(stream_info.get('AVERAGE-BANDWIDTH') or stream_info.get('BANDWIDTH'), scale=1000)
|
||||
format_id = []
|
||||
if m3u8_id:
|
||||
format_id.append(m3u8_id)
|
||||
last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
|
||||
format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
|
||||
format_id.append('%d' % (tbr if tbr else len(formats)))
|
||||
f = {
|
||||
'format_id': '-'.join(format_id),
|
||||
'url': format_url(line.strip()),
|
||||
'url': format_url(url),
|
||||
'tbr': tbr,
|
||||
'fps': float_or_none(stream_info.get('FRAME-RATE')),
|
||||
'language': lang,
|
||||
'ext': ext,
|
||||
'protocol': entry_protocol,
|
||||
'preference': preference,
|
||||
}
|
||||
resolution = last_info.get('RESOLUTION')
|
||||
resolution = stream_info.get('RESOLUTION')
|
||||
if resolution:
|
||||
width_str, height_str = resolution.split('x')
|
||||
f['width'] = int(width_str)
|
||||
f['height'] = int(height_str)
|
||||
codecs = last_info.get('CODECS')
|
||||
r = {
|
||||
'width': int(width_str),
|
||||
'height': int(height_str),
|
||||
}
|
||||
f.update(r)
|
||||
group = stream_info.get('VIDEO')
|
||||
if group:
|
||||
groups[group] = r
|
||||
codecs = stream_info.get('CODECS')
|
||||
if codecs:
|
||||
vcodec, acodec = [None] * 2
|
||||
va_codecs = codecs.split(',')
|
||||
@ -1120,11 +1173,16 @@ class InfoExtractor(object):
|
||||
'acodec': acodec,
|
||||
'vcodec': vcodec,
|
||||
})
|
||||
if last_media is not None:
|
||||
f['m3u8_media'] = last_media
|
||||
last_media = None
|
||||
formats.append(f)
|
||||
last_info = {}
|
||||
for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=VIDEO.*)', m3u8_doc):
|
||||
media_info = self._parse_m3u8_attributes(attrib)
|
||||
url = media_info.get('URI')
|
||||
if url:
|
||||
v = parse_media_info(media_info)
|
||||
group = groups.get(media_info['GROUP-ID'])
|
||||
if group:
|
||||
v.update(group)
|
||||
formats.append(v)
|
||||
self._sort_formats(formats)
|
||||
return formats
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user