From 662087e491912be66dbf6866df3846608fce6904 Mon Sep 17 00:00:00 2001
From: thezero <io@thezero.org>
Date: Sun, 28 Jul 2019 23:08:39 +0200
Subject: [PATCH] [sproutvideo] Add new extractor (closes #7935)

---
 youtube_dl/downloader/__init__.py   |  3 ++
 youtube_dl/downloader/fragment.py   |  3 ++
 youtube_dl/downloader/hls.py        | 18 ++++---
 youtube_dl/downloader/http.py       |  6 ++-
 youtube_dl/extractor/extractors.py  |  1 +
 youtube_dl/extractor/generic.py     | 17 ++++++
 youtube_dl/extractor/sproutvideo.py | 80 +++++++++++++++++++++++++++++
 7 files changed, 120 insertions(+), 8 deletions(-)
 create mode 100644 youtube_dl/extractor/sproutvideo.py

diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
index 2e485df9d..28cc6a363 100644
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@@ -43,6 +43,9 @@ def get_suitable_downloader(info_dict, params={}):
         if ed.can_download(info_dict):
             return ed
 
+    if info_dict.get('force_hlsdl') is True:
+        return HlsFD
+
     if protocol.startswith('m3u8') and info_dict.get('is_live'):
         return FFmpegFD
 
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 02f35459e..449795bc7 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -121,6 +121,8 @@ class FragmentFD(FileDownloader):
             del ctx['fragment_filename_sanitized']
 
     def _prepare_frag_download(self, ctx):
+        if 'hls' not in ctx:
+            ctx['hls'] = False
         if 'live' not in ctx:
             ctx['live'] = False
         if not ctx['live']:
@@ -143,6 +145,7 @@ class FragmentFD(FileDownloader):
                 'retries': self.params.get('retries', 0),
                 'nopart': self.params.get('nopart', False),
                 'test': self.params.get('test', False),
+                'hls': ctx['hls'],
             }
         )
         tmpfilename = self.temp_name(ctx['filename'])
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 84bc34928..84bfc4a87 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -105,6 +105,7 @@ class HlsFD(FragmentFD):
             'filename': filename,
             'total_frags': media_frags,
             'ad_frags': ad_frags,
+            'hls': '#EXT-X-KEY:METHOD=AES-128' in s,
         }
 
         self._prepare_and_start_frag_download(ctx)
@@ -113,10 +114,15 @@ class HlsFD(FragmentFD):
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
         test = self.params.get('test', False)
 
-        extra_query = None
+        extra_segment_query = None
+        extra_key_query = None
         extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
         if extra_param_to_segment_url:
-            extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+            extra_segment_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+            extra_key_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+        extra_param_to_key_url = info_dict.get('extra_param_to_key_url')
+        if extra_param_to_key_url:
+            extra_key_query = compat_urlparse.parse_qs(extra_param_to_key_url)
         i = 0
         media_sequence = 0
         decrypt_info = {'METHOD': 'NONE'}
@@ -136,8 +142,8 @@ class HlsFD(FragmentFD):
                         line
                         if re.match(r'^https?://', line)
                         else compat_urlparse.urljoin(man_url, line))
-                    if extra_query:
-                        frag_url = update_url_query(frag_url, extra_query)
+                    if extra_segment_query:
+                        frag_url = update_url_query(frag_url, extra_segment_query)
                     count = 0
                     headers = info_dict.get('http_headers', {})
                     if byte_range:
@@ -187,8 +193,8 @@ class HlsFD(FragmentFD):
                         if not re.match(r'^https?://', decrypt_info['URI']):
                             decrypt_info['URI'] = compat_urlparse.urljoin(
                                 man_url, decrypt_info['URI'])
-                        if extra_query:
-                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
+                        if extra_key_query:
+                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_key_query)
                         if decrypt_url != decrypt_info['URI']:
                             decrypt_info['KEY'] = None
                 elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 3c72ea18b..431483bc4 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -45,7 +45,8 @@ class HttpFD(FileDownloader):
             headers.update(add_headers)
 
         is_test = self.params.get('test', False)
-        chunk_size = self._TEST_FILE_SIZE if is_test else (
+        is_hls = self.params.get('hls', False)
+        chunk_size = self._TEST_FILE_SIZE if is_test and not is_hls else (
             info_dict.get('downloader_options', {}).get('http_chunk_size')
             or self.params.get('http_chunk_size') or 0)
 
@@ -194,7 +195,8 @@ class HttpFD(FileDownloader):
             # However, for a test we still would like to download just a piece of a file.
             # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
             # block size when downloading a file.
-            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+            # If we are using HLS we cannot cut the fragment because it will break the decryption.
+            if is_test and not is_hls and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
                 data_len = self._TEST_FILE_SIZE
 
             if data_len is not None:
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e407ab3d9..42ee3ff65 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1053,6 +1053,7 @@ from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
 from .springboardplatform import SpringboardPlatformIE
 from .sprout import SproutIE
+from .sproutvideo import SproutVideoIE
 from .srgssr import (
     SRGSSRIE,
     SRGSSRPlayIE,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index ce8252f6a..13dc6f34b 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -119,6 +119,7 @@ from .expressen import ExpressenIE
 from .zype import ZypeIE
 from .odnoklassniki import OdnoklassnikiIE
 from .kinja import KinjaEmbedIE
+from .sproutvideo import SproutVideoIE
 
 
 class GenericIE(InfoExtractor):
@@ -2142,6 +2143,18 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # SproutVideo iframe in page
+            'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
+            'info_dict': {
+                'id': '4c9dddb01910e3c9c4',
+                'ext': 'mp4',
+                'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
@@ -3201,6 +3214,10 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
 
+        sproutvideo_url = SproutVideoIE._extract_url(webpage)
+        if sproutvideo_url:
+            return self.url_result(sproutvideo_url)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries:
diff --git a/youtube_dl/extractor/sproutvideo.py b/youtube_dl/extractor/sproutvideo.py
new file mode 100644
index 000000000..99cf3f727
--- /dev/null
+++ b/youtube_dl/extractor/sproutvideo.py
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_urlencode,
+)
+
+
+class SproutVideoIE(InfoExtractor):
+    """Extractor for SproutVideo embed pages, whose HLS streams are signed per resource type."""
+    _VALID_URL = r'(?:https?:|)//videos\.sproutvideo\.com/embed/(?P<id>[a-f0-9]+)/[a-f0-9]+'
+    _TEST = {
+        'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
+        'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
+        'info_dict': {
+            'id': '4c9dddb01910e3c9c4',
+            'ext': 'mp4',
+            'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
+        }
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        """Return the first SproutVideo embed URL found in webpage, or None."""
+        # Stop at the closing quote; a greedy '.*' would also swallow attributes following src/href
+        mobj = re.search(
+            r'(?:<iframe\s+class=[\'"]sproutvideo-player[^>]*?\bsrc|href)=[\'"](?P<url>%s[^\'"]*)[\'"]'
+            % SproutVideoIE._VALID_URL, webpage)
+        return mobj.group('url') if mobj else None
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The player configuration is embedded as base64-encoded JSON
+        data = self._search_regex(r'<script[^>]+>var dat = \'([^\']+)\';</script>', webpage, 'data')
+        parsed_data = self._parse_json(compat_b64decode(data).decode('utf-8'), video_id)
+
+        # https://github.com/ytdl-org/youtube-dl/issues/16996#issuecomment-406901324
+        # signature->m for manifests
+        # signature->k for keys
+        # signature->t for segments
+        m_sig = self._policy_to_qs(parsed_data, 'm')
+        k_sig = self._policy_to_qs(parsed_data, 'k')
+        t_sig = self._policy_to_qs(parsed_data, 't')
+
+        m3u8_url = 'https://%s.videos.sproutvideo.com/%s/%s/video/index.m3u8?%s' % (
+            parsed_data['base'], parsed_data['s3_user_hash'], parsed_data['s3_video_hash'], m_sig)
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+        self._sort_formats(formats)
+
+        # Format URLs get the manifest signature too; use '&' if a query string is already present
+        for f in formats:
+            f['url'] += ('&' if '?' in f['url'] else '?') + m_sig
+
+        return {
+            'id': video_id,
+            'title': parsed_data['title'],
+            'formats': formats,
+            'force_hlsdl': True,  # currently FFmpeg is not supported
+            'extra_param_to_segment_url': t_sig,
+            'extra_param_to_key_url': k_sig,
+        }
+
+    def _format_qsdata(self, qs_data):
+        """Return a copy of qs_data with the 'CloudFront-' prefix removed from every key."""
+        return dict((key.replace('CloudFront-', ''), value) for key, value in qs_data.items())
+
+    def _policy_to_qs(self, policy, key):
+        """Return the urlencoded query string for signature type key, including the sessionID."""
+        sig = self._format_qsdata(policy['signatures'][key])
+        sig['sessionID'] = policy['sessionID']
+        return compat_urllib_parse_urlencode(sig, doseq=True)