[extractor/common] improve m3u8 info extraction
This commit is contained in:
parent
f6f6217a98
commit
8f0bf01277
@ -1007,26 +1007,56 @@ class InfoExtractor(object):
|
|||||||
|
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
|
def _extract_m3u8_info(self, m3u8_url, video_id, ext=None,
                       entry_protocol='m3u8', preference=None,
                       m3u8_id=None, note=None, errnote=None,
                       fatal=True):
    """Download an m3u8 playlist and hand it to ``_parse_m3u8``.

    ``note`` and ``errnote`` override the default progress and failure
    messages passed to ``_download_webpage_handle``; ``fatal`` is
    forwarded to it unchanged.  ``ext``, ``entry_protocol``,
    ``preference`` and ``m3u8_id`` are forwarded to ``_parse_m3u8``.

    Returns whatever ``_parse_m3u8`` returns, or an empty list when the
    download helper returns ``False``.
    """
    progress_note = note or 'Downloading m3u8 information'
    failure_note = errnote or 'Failed to download m3u8 information'
    download = self._download_webpage_handle(
        m3u8_url, video_id,
        note=progress_note, errnote=failure_note, fatal=fatal)
    if download is False:
        # NOTE(review): the success path returns the dict built by
        # _parse_m3u8 while this path returns a list -- confirm callers
        # cope with both shapes.
        return []
    playlist_text, handle = download
    # Parse relative to the URL reported by the response handle rather
    # than the one requested (presumably the post-redirect URL -- confirm).
    return self._parse_m3u8(
        playlist_text, handle.geturl(), ext, entry_protocol, preference,
        m3u8_id)
|
||||||
|
|
||||||
|
def _parse_m3u8(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None):
    """Parse an already-downloaded m3u8 document.

    Returns a dict with two keys: ``'formats'`` holds the result of
    ``_parse_m3u8_formats`` and ``'subtitles'`` the result of
    ``_parse_m3u8_subtitles``, both computed from ``m3u8_doc`` and
    ``m3u8_url``.
    """
    # Formats are parsed before subtitles, matching the original
    # evaluation order of the dict literal.
    parsed_formats = self._parse_m3u8_formats(
        m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
    parsed_subtitles = self._parse_m3u8_subtitles(m3u8_doc, m3u8_url)
    return {
        'formats': parsed_formats,
        'subtitles': parsed_subtitles,
    }
|
||||||
|
|
||||||
|
def _parse_m3u8_attributes(self, attrib):
|
||||||
|
info = {}
|
||||||
|
for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
|
||||||
|
if val.startswith('"'):
|
||||||
|
val = val[1:-1]
|
||||||
|
info[key] = val
|
||||||
|
return info
|
||||||
|
|
||||||
|
def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
    """Collect the subtitle renditions declared in an m3u8 document.

    Every ``#EXT-X-MEDIA`` line of ``m3u8_doc`` containing
    ``TYPE=SUBTITLES`` is parsed with ``_parse_m3u8_attributes``.

    Returns a dict mapping a language key to a list of
    ``{'url': ..., 'ext': 'm3u8'}`` entries.  URIs that do not start with
    ``http://`` or ``https://`` are resolved against ``m3u8_url``.
    Entries without a URI are skipped; entries without a LANGUAGE are
    filed under their NAME, or under ``'und'`` when that is missing too.
    """
    def subtitle_url(u):
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    subtitles = {}
    for attrib in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=SUBTITLES.*)', m3u8_doc):
        # ``.*`` also captures the '\r' of a CRLF line ending; drop it so
        # the last attribute on the line parses cleanly.
        subtitle_info = self._parse_m3u8_attributes(attrib.strip())
        uri = subtitle_info.get('URI')
        if not uri:
            # Nothing to download for this rendition; skipping it is
            # better than aborting the whole extraction with a KeyError.
            continue
        # LANGUAGE is optional on EXT-X-MEDIA, so fall back to the
        # rendition NAME and finally to 'und' (undetermined).
        lang = subtitle_info.get('LANGUAGE') or subtitle_info.get('NAME') or 'und'
        subtitles.setdefault(lang, []).append({
            'url': subtitle_url(uri),
            'ext': 'm3u8',
        })
    return subtitles
|
||||||
|
|
||||||
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
|
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
|
||||||
entry_protocol='m3u8', preference=None,
|
entry_protocol='m3u8', preference=None,
|
||||||
m3u8_id=None, note=None, errnote=None,
|
m3u8_id=None, note=None, errnote=None,
|
||||||
fatal=True):
|
fatal=True):
|
||||||
|
|
||||||
formats = [{
|
|
||||||
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
|
|
||||||
'url': m3u8_url,
|
|
||||||
'ext': ext,
|
|
||||||
'protocol': 'm3u8',
|
|
||||||
'preference': preference - 1 if preference else -1,
|
|
||||||
'resolution': 'multiple',
|
|
||||||
'format_note': 'Quality selection URL',
|
|
||||||
}]
|
|
||||||
|
|
||||||
format_url = lambda u: (
|
|
||||||
u
|
|
||||||
if re.match(r'^https?://', u)
|
|
||||||
else compat_urlparse.urljoin(m3u8_url, u))
|
|
||||||
|
|
||||||
res = self._download_webpage_handle(
|
res = self._download_webpage_handle(
|
||||||
m3u8_url, video_id,
|
m3u8_url, video_id,
|
||||||
note=note or 'Downloading m3u8 information',
|
note=note or 'Downloading m3u8 information',
|
||||||
@ -1037,6 +1067,12 @@ class InfoExtractor(object):
|
|||||||
m3u8_doc, urlh = res
|
m3u8_doc, urlh = res
|
||||||
m3u8_url = urlh.geturl()
|
m3u8_url = urlh.geturl()
|
||||||
|
|
||||||
|
return self._parse_m3u8_formats(m3u8_doc, m3u8_url, ext, entry_protocol, preference, m3u8_id)
|
||||||
|
|
||||||
|
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
|
||||||
|
entry_protocol='m3u8', preference=None,
|
||||||
|
m3u8_id=None):
|
||||||
|
|
||||||
# We should try extracting formats only from master playlists [1], i.e.
|
# We should try extracting formats only from master playlists [1], i.e.
|
||||||
# playlists that describe available qualities. On the other hand media
|
# playlists that describe available qualities. On the other hand media
|
||||||
# playlists [2] should be returned as is since they contain just the media
|
# playlists [2] should be returned as is since they contain just the media
|
||||||
@ -1058,73 +1094,95 @@ class InfoExtractor(object):
|
|||||||
'protocol': entry_protocol,
|
'protocol': entry_protocol,
|
||||||
'preference': preference,
|
'preference': preference,
|
||||||
}]
|
}]
|
||||||
last_info = None
|
|
||||||
last_media = None
|
formats = [{
|
||||||
kv_rex = re.compile(
|
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
|
||||||
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
|
'url': m3u8_url,
|
||||||
for line in m3u8_doc.splitlines():
|
'ext': ext,
|
||||||
if line.startswith('#EXT-X-STREAM-INF:'):
|
'protocol': 'm3u8',
|
||||||
last_info = {}
|
'preference': preference - 1 if preference else -1,
|
||||||
for m in kv_rex.finditer(line):
|
'resolution': 'multiple',
|
||||||
v = m.group('val')
|
'format_note': 'Quality selection URL',
|
||||||
if v.startswith('"'):
|
}]
|
||||||
v = v[1:-1]
|
|
||||||
last_info[m.group('key')] = v
|
format_url = lambda u: (
|
||||||
elif line.startswith('#EXT-X-MEDIA:'):
|
u
|
||||||
last_media = {}
|
if re.match(r'^https?://', u)
|
||||||
for m in kv_rex.finditer(line):
|
else compat_urlparse.urljoin(m3u8_url, u))
|
||||||
v = m.group('val')
|
|
||||||
if v.startswith('"'):
|
def parse_media_info(media_info):
|
||||||
v = v[1:-1]
|
return {
|
||||||
last_media[m.group('key')] = v
|
'format_id': '%s-%s' % (media_info['GROUP-ID'], media_info['NAME']),
|
||||||
elif line.startswith('#') or not line.strip():
|
'url': format_url(url),
|
||||||
continue
|
'language': media_info.get('LANGUAGE'),
|
||||||
|
'vcodec': 'none' if media_info['TYPE'] == 'AUDIO' else None
|
||||||
|
}
|
||||||
|
|
||||||
|
lang = None
|
||||||
|
groups = {}
|
||||||
|
for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=AUDIO.*)', m3u8_doc):
|
||||||
|
media_info = self._parse_m3u8_attributes(attrib)
|
||||||
|
url = media_info.get('URI')
|
||||||
|
if url:
|
||||||
|
formats.append(parse_media_info(media_info))
|
||||||
else:
|
else:
|
||||||
if last_info is None:
|
lang = media_info.get('LANGUAGE')
|
||||||
formats.append({'url': format_url(line)})
|
for (attrib, url) in re.findall(r'#EXT-X-STREAM-INF:(?P<attrib>.*)\r?\n(?P<url>.+)', m3u8_doc):
|
||||||
continue
|
stream_info = self._parse_m3u8_attributes(attrib)
|
||||||
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
|
tbr = int_or_none(stream_info.get('AVERAGE-BANDWIDTH') or stream_info.get('BANDWIDTH'), scale=1000)
|
||||||
format_id = []
|
format_id = []
|
||||||
if m3u8_id:
|
if m3u8_id:
|
||||||
format_id.append(m3u8_id)
|
format_id.append(m3u8_id)
|
||||||
last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
|
format_id.append('%d' % (tbr if tbr else len(formats)))
|
||||||
format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
|
f = {
|
||||||
f = {
|
'format_id': '-'.join(format_id),
|
||||||
'format_id': '-'.join(format_id),
|
'url': format_url(url),
|
||||||
'url': format_url(line.strip()),
|
'tbr': tbr,
|
||||||
'tbr': tbr,
|
'fps': float_or_none(stream_info.get('FRAME-RATE')),
|
||||||
'ext': ext,
|
'language': lang,
|
||||||
'protocol': entry_protocol,
|
'ext': ext,
|
||||||
'preference': preference,
|
'protocol': entry_protocol,
|
||||||
|
'preference': preference,
|
||||||
|
}
|
||||||
|
resolution = stream_info.get('RESOLUTION')
|
||||||
|
if resolution:
|
||||||
|
width_str, height_str = resolution.split('x')
|
||||||
|
r = {
|
||||||
|
'width': int(width_str),
|
||||||
|
'height': int(height_str),
|
||||||
}
|
}
|
||||||
resolution = last_info.get('RESOLUTION')
|
f.update(r)
|
||||||
if resolution:
|
group = stream_info.get('VIDEO')
|
||||||
width_str, height_str = resolution.split('x')
|
if group:
|
||||||
f['width'] = int(width_str)
|
groups[group] = r
|
||||||
f['height'] = int(height_str)
|
codecs = stream_info.get('CODECS')
|
||||||
codecs = last_info.get('CODECS')
|
if codecs:
|
||||||
if codecs:
|
vcodec, acodec = [None] * 2
|
||||||
vcodec, acodec = [None] * 2
|
va_codecs = codecs.split(',')
|
||||||
va_codecs = codecs.split(',')
|
if len(va_codecs) == 1:
|
||||||
if len(va_codecs) == 1:
|
# Audio only entries usually come with single codec and
|
||||||
# Audio only entries usually come with single codec and
|
# no resolution. For more robustness we also check it to
|
||||||
# no resolution. For more robustness we also check it to
|
# be mp4 audio.
|
||||||
# be mp4 audio.
|
if not resolution and va_codecs[0].startswith('mp4a'):
|
||||||
if not resolution and va_codecs[0].startswith('mp4a'):
|
vcodec, acodec = 'none', va_codecs[0]
|
||||||
vcodec, acodec = 'none', va_codecs[0]
|
|
||||||
else:
|
|
||||||
vcodec = va_codecs[0]
|
|
||||||
else:
|
else:
|
||||||
vcodec, acodec = va_codecs[:2]
|
vcodec = va_codecs[0]
|
||||||
f.update({
|
else:
|
||||||
'acodec': acodec,
|
vcodec, acodec = va_codecs[:2]
|
||||||
'vcodec': vcodec,
|
f.update({
|
||||||
})
|
'acodec': acodec,
|
||||||
if last_media is not None:
|
'vcodec': vcodec,
|
||||||
f['m3u8_media'] = last_media
|
})
|
||||||
last_media = None
|
formats.append(f)
|
||||||
formats.append(f)
|
for (attrib) in re.findall(r'#EXT-X-MEDIA:(?P<attrib>.*TYPE=VIDEO.*)', m3u8_doc):
|
||||||
last_info = {}
|
media_info = self._parse_m3u8_attributes(attrib)
|
||||||
|
url = media_info.get('URI')
|
||||||
|
if url:
|
||||||
|
v = parse_media_info(media_info)
|
||||||
|
group = groups.get(media_info['GROUP-ID'])
|
||||||
|
if group:
|
||||||
|
v.update(group)
|
||||||
|
formats.append(v)
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user