[sproutvideo] Add new extractor (closes #7935)

thezero 2019-07-28 23:08:39 +02:00
parent dcc8522fdb
commit 662087e491
7 changed files with 120 additions and 8 deletions

youtube_dl/downloader/__init__.py

@@ -43,6 +43,9 @@ def get_suitable_downloader(info_dict, params={}):
         if ed.can_download(info_dict):
             return ed
 
+    if info_dict.get('force_hlsdl') is True:
+        return HlsFD
+
     if protocol.startswith('m3u8') and info_dict.get('is_live'):
         return FFmpegFD

youtube_dl/downloader/fragment.py

@@ -121,6 +121,8 @@ class FragmentFD(FileDownloader):
             del ctx['fragment_filename_sanitized']
 
     def _prepare_frag_download(self, ctx):
+        if 'hls' not in ctx:
+            ctx['hls'] = False
         if 'live' not in ctx:
             ctx['live'] = False
         if not ctx['live']:
@@ -143,6 +145,7 @@ class FragmentFD(FileDownloader):
                 'retries': self.params.get('retries', 0),
                 'nopart': self.params.get('nopart', False),
                 'test': self.params.get('test', False),
+                'hls': ctx['hls'],
             }
         )
         tmpfilename = self.temp_name(ctx['filename'])

youtube_dl/downloader/hls.py

@@ -105,6 +105,7 @@ class HlsFD(FragmentFD):
             'filename': filename,
             'total_frags': media_frags,
             'ad_frags': ad_frags,
+            'hls': '#EXT-X-KEY:METHOD=AES-128' in s,
         }
 
         self._prepare_and_start_frag_download(ctx)
@@ -113,10 +114,15 @@ class HlsFD(FragmentFD):
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
         test = self.params.get('test', False)
-        extra_query = None
+        extra_segment_query = None
+        extra_key_query = None
         extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
         if extra_param_to_segment_url:
-            extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+            extra_segment_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+            extra_key_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+        extra_param_to_key_url = info_dict.get('extra_param_to_key_url')
+        if extra_param_to_key_url:
+            extra_key_query = compat_urlparse.parse_qs(extra_param_to_key_url)
         i = 0
         media_sequence = 0
         decrypt_info = {'METHOD': 'NONE'}
@@ -136,8 +142,8 @@ class HlsFD(FragmentFD):
                         line
                         if re.match(r'^https?://', line)
                         else compat_urlparse.urljoin(man_url, line))
-                    if extra_query:
-                        frag_url = update_url_query(frag_url, extra_query)
+                    if extra_segment_query:
+                        frag_url = update_url_query(frag_url, extra_segment_query)
                     count = 0
                     headers = info_dict.get('http_headers', {})
                     if byte_range:
@@ -187,8 +193,8 @@ class HlsFD(FragmentFD):
                         if not re.match(r'^https?://', decrypt_info['URI']):
                             decrypt_info['URI'] = compat_urlparse.urljoin(
                                 man_url, decrypt_info['URI'])
-                        if extra_query:
-                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
+                        if extra_key_query:
+                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_key_query)
                         if decrypt_url != decrypt_info['URI']:
                             decrypt_info['KEY'] = None
                 elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
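
The point of splitting extra_query into extra_segment_query and extra_key_query is that segment URLs and key URLs can now carry different signed query strings. A small sketch of how they end up being applied, reusing youtube-dl's own update_url_query and compat_urlparse helpers; the host and signature values below are made up and parameter order may vary:

# Made-up CloudFront-style signatures for segments (t) and keys (k).
from youtube_dl.compat import compat_urlparse
from youtube_dl.utils import update_url_query

t_sig = 'Policy=pT&Signature=sT&Key-Pair-Id=kT'
k_sig = 'Policy=pK&Signature=sK&Key-Pair-Id=kK'

extra_segment_query = compat_urlparse.parse_qs(t_sig)
extra_key_query = compat_urlparse.parse_qs(k_sig)

print(update_url_query('https://host.example/seg-0001.ts', extra_segment_query))
# -> https://host.example/seg-0001.ts?Policy=pT&Signature=sT&Key-Pair-Id=kT
print(update_url_query('https://host.example/keys/aes.key', extra_key_query))
# -> https://host.example/keys/aes.key?Policy=pK&Signature=sK&Key-Pair-Id=kK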

youtube_dl/downloader/http.py

@@ -45,7 +45,8 @@ class HttpFD(FileDownloader):
             headers.update(add_headers)
 
         is_test = self.params.get('test', False)
-        chunk_size = self._TEST_FILE_SIZE if is_test else (
+        is_hls = self.params.get('hls', False)
+        chunk_size = self._TEST_FILE_SIZE if is_test and not is_hls else (
             info_dict.get('downloader_options', {}).get('http_chunk_size')
             or self.params.get('http_chunk_size') or 0)
@@ -194,7 +195,8 @@ class HttpFD(FileDownloader):
            # However, for a test we still would like to download just a piece of a file.
            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
            # block size when downloading a file.
-            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+            # If we are using HLS we cannot cut the fragment because it will break the decryption.
+            if is_test and not is_hls and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
                data_len = self._TEST_FILE_SIZE
 
            if data_len is not None:
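
Why cutting the fragment breaks decryption: fragments protected by #EXT-X-KEY:METHOD=AES-128 are AES-128-CBC encrypted, and CBC can only be decrypted in whole 16-byte blocks, so a fragment truncated at _TEST_FILE_SIZE usually ends mid-block. A quick demonstration, assuming pycryptodome (the optional AES backend youtube-dl uses for HLS decryption) is installed; the key and IV here are dummies:

# Dummy key/IV; the real ones come from the #EXT-X-KEY URI and the media sequence.
from Crypto.Cipher import AES

key = b'\x00' * 16
iv = b'\x01' * 16

ciphertext = AES.new(key, AES.MODE_CBC, iv).encrypt(b'A' * 32)  # two full blocks

AES.new(key, AES.MODE_CBC, iv).decrypt(ciphertext)  # OK: length is a multiple of 16
try:
    AES.new(key, AES.MODE_CBC, iv).decrypt(ciphertext[:20])  # cut mid-block
except ValueError as err:
    print('truncated fragment cannot be decrypted:', err)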

youtube_dl/extractor/extractors.py

@@ -1053,6 +1053,7 @@ from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
 from .springboardplatform import SpringboardPlatformIE
 from .sprout import SproutIE
+from .sproutvideo import SproutVideoIE
 from .srgssr import (
     SRGSSRIE,
     SRGSSRPlayIE,

youtube_dl/extractor/generic.py

@@ -119,6 +119,7 @@ from .expressen import ExpressenIE
 from .zype import ZypeIE
 from .odnoklassniki import OdnoklassnikiIE
 from .kinja import KinjaEmbedIE
+from .sproutvideo import SproutVideoIE
 
 
 class GenericIE(InfoExtractor):
@@ -2142,6 +2143,18 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # SproutVideo iframe in page
+            'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
+            'info_dict': {
+                'id': '4c9dddb01910e3c9c4',
+                'ext': 'mp4',
+                'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
@@ -3201,6 +3214,10 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
 
+        sproutvideo_url = SproutVideoIE._extract_url(webpage)
+        if sproutvideo_url:
+            return self.url_result(sproutvideo_url)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries:
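
For reference, a minimal, hypothetical page snippet that this new hook should detect and hand over to SproutVideoIE; the embed hashes are the ones from the test above, and the attribute order is chosen so the regex's greedy tail still yields a clean URL:

# Hypothetical embed markup matched by SproutVideoIE._extract_url().
from youtube_dl.extractor.sproutvideo import SproutVideoIE

webpage = ('<p>some article text</p>'
           '<iframe class="sproutvideo-player" width="630" height="354" frameborder="0" '
           'src="https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3">'
           '</iframe>')

print(SproutVideoIE._extract_url(webpage))
# https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3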

youtube_dl/extractor/sproutvideo.py (new file)

@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_urlencode,
+)
+
+
+class SproutVideoIE(InfoExtractor):
+    _VALID_URL = r'(?:https?:|)//videos.sproutvideo.com/embed/(?P<id>[a-f0-9]+)/[a-f0-9]+\??.*'
+    _TEST = {
+        'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
+        'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
+        'info_dict': {
+            'id': '4c9dddb01910e3c9c4',
+            'ext': 'mp4',
+            'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
+        }
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        sproutvideo = re.search(
+            r'(?:<iframe\s+class=[\'\"]sproutvideo-player.*src|href)=[\'\"](?P<url>%s)[\'\"]' % SproutVideoIE._VALID_URL, webpage)
+        if sproutvideo:
+            return sproutvideo.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        data = self._search_regex(r'<script[^>]+>var dat = \'([^\']+)\';</script>', webpage, 'data')
+        data_decoded = compat_b64decode(data).decode('utf-8')
+        parsed_data = self._parse_json(data_decoded, video_id)
+
+        # https://github.com/ytdl-org/youtube-dl/issues/16996#issuecomment-406901324
+        # signature->m for manifests
+        # signature->k for keys
+        # signature->t for segments
+        m_sig = self._policy_to_qs(parsed_data, 'm')
+        k_sig = self._policy_to_qs(parsed_data, 'k')
+        t_sig = self._policy_to_qs(parsed_data, 't')
+
+        url = "https://{0}.videos.sproutvideo.com/{1}/{2}/video/index.m3u8?{3}"
+        url = url.format(parsed_data['base'],
+                         parsed_data['s3_user_hash'],
+                         parsed_data['s3_video_hash'],
+                         m_sig)
+
+        formats = self._extract_m3u8_formats(url, video_id, 'mp4', 'm3u8_native',
+                                             m3u8_id='hls', fatal=False)
+        self._sort_formats(formats)
+        for i in range(len(formats)):
+            formats[i]['url'] = "{}?{}".format(formats[i]['url'], m_sig)
+
+        return {
+            'id': video_id,
+            'title': parsed_data['title'],
+            'formats': formats,
+            'force_hlsdl': True,  # currently FFmpeg is not supported
+            'extra_param_to_segment_url': t_sig,
+            'extra_param_to_key_url': k_sig
+        }
+
+    def _format_qsdata(self, qs_data):
+        parsed_dict = dict()
+        for key in qs_data:
+            parsed_dict[key.replace('CloudFront-', '')] = qs_data[key]
+        return parsed_dict
+
+    def _policy_to_qs(self, policy, key):
+        sig = self._format_qsdata(policy['signatures'][key])
+        sig['sessionID'] = policy['sessionID']
+        return compat_urllib_parse_urlencode(sig, doseq=True)
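
For context, the base64-encoded 'dat' blob decodes to JSON roughly shaped like the sketch below; only the keys the extractor reads are shown and all values are invented. _policy_to_qs() strips the CloudFront- prefix from each signature field and appends the shared sessionID, producing one signed query string each for manifests, keys and segments:

# Invented example payload; only the keys read by the extractor above are shown.
parsed_data = {
    'base': 'hls1',
    's3_user_hash': 'a1b2c3d4e5',
    's3_video_hash': '4c9dddb01910e3c9c4',
    'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
    'sessionID': 'deadbeefcafe',
    'signatures': {
        'm': {  # manifests
            'CloudFront-Policy': 'eyJTdGF0ZW1lbnQi...',
            'CloudFront-Signature': 'abc~def',
            'CloudFront-Key-Pair-Id': 'APKAEXAMPLE',
        },
        'k': {'CloudFront-Policy': '...', 'CloudFront-Signature': '...', 'CloudFront-Key-Pair-Id': '...'},  # keys
        't': {'CloudFront-Policy': '...', 'CloudFront-Signature': '...', 'CloudFront-Key-Pair-Id': '...'},  # segments
    },
}
# _policy_to_qs(parsed_data, 'm') would then return something like:
#   Policy=eyJTdGF0ZW1lbnQi...&Signature=abc~def&Key-Pair-Id=APKAEXAMPLE&sessionID=deadbeefcafe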