From 3fc56635b7d375b262ac3c15aaae549227b5227e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 14 Jan 2020 21:46:56 +0700 Subject: [PATCH 01/54] [ndr:base:embed] Improve thumbnails extraction (closes #23731) --- youtube_dl/extractor/ndr.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index aec2ea133..9c8bf05af 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -9,6 +9,8 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + try_get, + urljoin, ) @@ -220,11 +222,17 @@ class NDREmbedBaseIE(InfoExtractor): upload_date = ppjson.get('config', {}).get('publicationDate') duration = int_or_none(config.get('duration')) - thumbnails = [{ - 'id': thumbnail.get('quality') or thumbnail_id, - 'url': thumbnail['src'], - 'preference': quality_key(thumbnail.get('quality')), - } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')] + thumbnails = [] + poster = try_get(config, lambda x: x['poster'], dict) or {} + for thumbnail_id, thumbnail in poster.items(): + thumbnail_url = urljoin(url, thumbnail.get('src')) + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('quality') or thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail.get('quality')), + }) return { 'id': video_id, From 628e5bc0b715c239e5fe367bc538a1c1fa563787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 14 Jan 2020 23:48:36 +0700 Subject: [PATCH 02/54] [canvas] Add support for new API endpoint and update tests (closes #17680, closes #18629) --- youtube_dl/extractor/canvas.py | 83 +++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index c506bc5dd..8667a0d04 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -13,6 +13,8 @@ from ..utils import ( int_or_none, merge_dicts, parse_iso8601, + str_or_none, + url_or_none, ) @@ -20,15 +22,15 @@ class CanvasIE(InfoExtractor): _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', + 'md5': '68993eda72ef62386a15ea2cf3c93107', 'info_dict': { 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Nachtwacht: De Greystook', - 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', + 'description': 'Nachtwacht: De Greystook', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.04, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { @@ -39,23 +41,45 @@ class CanvasIE(InfoExtractor): 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', } + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') + # Old API endpoint, serves more formats but may fail for some videos data = self._download_json( 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id) + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) + + # New API endpoint + if not data: + token = self._download_json( + '%s/tokens' % self._REST_API_BASE, video_id, + 'Downloading token', data=b'', + headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + data = self._download_json( + '%s/videos/%s' % (self._REST_API_BASE, video_id), + video_id, 'Downloading video JSON', fatal=False, query={ + 'vrtPlayerToken': token, + 'client': '%s@PROD' % site_id, + }, expected_status=400) + message = data.get('message') + if message and not data.get('title'): + if data.get('code') == 'AUTHENTICATION_REQUIRED': + self.raise_login_required(message) + raise ExtractorError(message, expected=True) title = data['title'] description = data.get('description') formats = [] for target in data['targetUrls']: - format_url, format_type = target.get('url'), target.get('type') + format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) if not format_url or not format_type: continue + format_type = format_type.upper() if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], @@ -134,20 +158,20 @@ class CanvasEenIE(InfoExtractor): }, 'skip': 'Pagina niet gevonden', }, { - 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', + 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', 'info_dict': { - 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f', - 'display_id': 'herbekijk-sorry-voor-alles', + 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', + 'display_id': 'emma-pakt-thilly-aan', 'ext': 'mp4', - 'title': 'Herbekijk Sorry voor alles', - 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3', + 'title': 'Emma pakt Thilly aan', + 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 3788.06, + 'duration': 118.24, }, 'params': { 'skip_download': True, }, - 'skip': 'Episode no longer available', + 'expected_warnings': ['is not a supported codec'], }, { 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', 'only_matching': True, @@ -183,19 +207,44 @@ class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ + # Available via old API endpoint 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', 'info_dict': { 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'De zwarte weduwe', - 'description': 'md5:d90c21dced7db869a85db89a623998d4', + 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': '1', + 'season': 'Season 1', 'season_number': 1, 'episode_number': 1, }, - 'skip': 'This video is only available for registered users' + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '', + 'password': '', + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # Only available via new API endpoint + 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', + 'info_dict': { + 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', + 'ext': 'mp4', + 'title': 'Aflevering 5', + 'description': 'Wie valt door de mand tijdens een missie?', + 'duration': 2967.06, + 'season': 'Season 1', + 'season_number': 1, + 'episode_number': 5, + }, + 'skip': 'This video is only available for registered users', + 'params': { + 'username': '', + 'password': '', + }, + 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' From 14bb191634e2e8ab89a2e94e9e4d009b6406c8b2 Mon Sep 17 00:00:00 2001 From: jnozsc Date: Tue, 14 Jan 2020 10:09:08 -0800 Subject: [PATCH 03/54] [travis] Add flake8 job (#23720) --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 14d95fa84..51afd469a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ dist: trusty env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download -matrix: +jobs: include: - python: 3.7 dist: xenial @@ -35,6 +35,11 @@ matrix: env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download + - name: flake8 + python: 3.8 + dist: xenial + install: pip install flake8 + script: flake8 . fast_finish: true allow_failures: - env: YTDL_TEST_SET=download From bfdc8340c90e0ce495d2927e7d555daa5ac05670 Mon Sep 17 00:00:00 2001 From: Moritz Patelscheck Date: Fri, 20 Dec 2019 00:02:39 +0100 Subject: [PATCH 04/54] [yourporn] Fix extraction (closes #21645, closes #22255, closes #23459) --- youtube_dl/extractor/yourporn.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py index 8a2d5f63b..98347491e 100644 --- a/youtube_dl/extractor/yourporn.py +++ b/youtube_dl/extractor/yourporn.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, urljoin, @@ -8,9 +9,9 @@ from ..utils import ( class YourPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:yourporn\.sexy|sxyprn\.com)/post/(?P[^/?#&.]+)' + _VALID_URL = r'https?://(?:www\.)?sxyprn\.com/post/(?P[^/?#&.]+)' _TESTS = [{ - 'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html', + 'url': 'https://sxyprn.com/post/57ffcb2e1179b.html', 'md5': '6f8682b6464033d87acaa7a8ff0c092e', 'info_dict': { 'id': '57ffcb2e1179b', @@ -33,11 +34,19 @@ class YourPornIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_url = urljoin(url, self._parse_json( + parts = self._parse_json( self._search_regex( r'data-vnfo=(["\'])(?P{.+?})\1', webpage, 'data info', group='data'), - video_id)[video_id]).replace('/cdn/', '/cdn5/') + video_id)[video_id].split('/') + + num = 0 + for c in parts[6] + parts[7]: + if c.isnumeric(): + num += int(c) + parts[5] = compat_str(int(parts[5]) - num) + parts[1] += '8' + video_url = urljoin(url, '/'.join(parts)) title = (self._search_regex( r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title', @@ -54,4 +63,5 @@ class YourPornIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'age_limit': 18, + 'ext': 'mp4', } From d7c55f226dd8fafb424eefb078f41b3fc410588b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Jan 2020 01:34:01 +0700 Subject: [PATCH 05/54] [ChangeLog] Actualize [ci skip] --- ChangeLog | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/ChangeLog b/ChangeLog index c33169cd8..cdf4dbc96 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +version + +Extractors +* [yourporn] Fix extraction (#21645, #22255, #23459) ++ [canvas] Add support for new API endpoint (#17680, #18629) +* [ndr:base:embed] Improve thumbnails extraction (#23731) ++ [vodplatform] Add support for embed.kwikmotion.com domain ++ [twitter] Add support for promo_video_website cards (#23711) +* [orf:radio] Clean description and improve extraction +* [orf:fm4] Fix extraction (#23599) +* [safari] Fix kaltura session extraction (#23679, #23670) +* [lego] Fix extraction and extract subtitle (#23687) +* [cloudflarestream] Improve extraction + + Add support for bytehighway.net domain + + Add support for signed URLs + + Extract thumbnail +* [naver] Improve extraction + * Improve geo-restriction handling + + Extract automatic captions + + Extract uploader metadata + + Extract VLive HLS formats + * Improve metadata extraction +- [pandatv] Remove extractor (#23630) +* [dctp] Fix format extraction (#23656) ++ [scrippsnetworks] Add support for www.discovery.com videos +* [discovery] Fix anonymous token extraction (#23650) +* [nrktv:seriebase] Fix extraction (#23625, #23537) +* [wistia] Improve format extraction and extract subtitles (#22590) +* [vice] Improve extraction (#23631) +* [redtube] Detect private videos (#23518) + + version 2020.01.01 Extractors From e8cf0dbdd8aa4f2dcd521d0bf7e7798e87867b52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Jan 2020 01:37:29 +0700 Subject: [PATCH 06/54] release 2020.01.15 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 97b8afcf9..cf8e6e411 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.01.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.15** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.01 + [debug] youtube-dl version 2020.01.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index de6c44a65..babbda464 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.01.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.15** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index a9dd5ca52..5498983ff 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.15** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 8347903ea..d46735951 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.01.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.15** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.01 + [debug] youtube-dl version 2020.01.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 92228513c..748b64756 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.01** +- [ ] I've verified that I'm running youtube-dl version **2020.01.15** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index cdf4dbc96..cc7fc4323 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.01.15 Extractors * [yourporn] Fix extraction (#21645, #22255, #23459) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e471aa79a..e9a8cc27a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -628,7 +628,6 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** - - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8ad2df674..932b138a9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.01.01' +__version__ = '2020.01.15' From e4e5fa6e3c1c2ca2d48dfb5a8b1f734bd627b2dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Jan 2020 04:13:10 +0700 Subject: [PATCH 07/54] [soundcloud] Restore previews extraction (closes #23739) --- youtube_dl/extractor/soundcloud.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 62e9d8643..a0b09f5b1 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -96,7 +96,7 @@ class SoundcloudIE(InfoExtractor): 'repost_count': int, } }, - # not streamable song + # not streamable song, preview { 'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', 'info_dict': { @@ -119,7 +119,6 @@ class SoundcloudIE(InfoExtractor): # rtmp 'skip_download': True, }, - 'skip': 'Preview', }, # private link { @@ -346,9 +345,9 @@ class SoundcloudIE(InfoExtractor): }) def invalid_url(url): - return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url) + return not url or url in format_urls - def add_format(f, protocol): + def add_format(f, protocol, is_preview=False): mobj = re.search(r'\.(?P\d+)\.(?P[0-9a-z]{3,4})(?=[/?])', stream_url) if mobj: for k, v in mobj.groupdict().items(): @@ -361,12 +360,16 @@ class SoundcloudIE(InfoExtractor): v = f.get(k) if v: format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') abr = f.get('abr') if abr: f['abr'] = int(abr) f.update({ 'format_id': '_'.join(format_id_list), 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'preference': -10 if preview else None, }) formats.append(f) @@ -377,7 +380,7 @@ class SoundcloudIE(InfoExtractor): if not isinstance(t, dict): continue format_url = url_or_none(t.get('url')) - if not format_url or t.get('snipped') or '/preview/' in format_url: + if not format_url: continue stream = self._download_json( format_url, track_id, query=query, fatal=False) @@ -400,7 +403,8 @@ class SoundcloudIE(InfoExtractor): add_format({ 'url': stream_url, 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol) + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) if not formats: # Old API, does not work for some tracks (e.g. From 90ea83c64d904587992105fb4506e80f6abb28b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Jan 2020 04:32:05 +0700 Subject: [PATCH 08/54] [orf:tvthek] Improve geo restricted videos detection (closes #23741) --- youtube_dl/extractor/orf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 45fc745a3..d54b8ace6 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -90,8 +90,11 @@ class ORFTVthekIE(InfoExtractor): format_id = '-'.join(format_id_list) ext = determine_ext(src) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id=format_id, fatal=False) + if any('/geoprotection' in f['url'] for f in m3u8_formats): + self.raise_geo_restricted() + formats.extend(m3u8_formats) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( src, video_id, f4m_id=format_id, fatal=False)) From a9866c0366bd6399b0f757527425466a3be4d128 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 Jan 2020 14:02:57 +0100 Subject: [PATCH 09/54] [zype] improve extraction - extract subtitles(closes #21258) - support URLs with alternative keys/tokens(#21258) - extract more metadata --- youtube_dl/extractor/generic.py | 3 + youtube_dl/extractor/trunews.py | 49 ++------------ youtube_dl/extractor/zype.py | 111 +++++++++++++++++++++++++++----- 3 files changed, 101 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a4aef106f..3c002472f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2098,6 +2098,9 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Smoky Barbecue Favorites', 'thumbnail': r're:^https?://.*\.jpe?g', + 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + 'upload_date': '20170909', + 'timestamp': 1504915200, }, 'add_ie': [ZypeIE.ie_key()], 'params': { diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py index b0c7caabf..cca5b5ceb 100644 --- a/youtube_dl/extractor/trunews.py +++ b/youtube_dl/extractor/trunews.py @@ -1,21 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - dict_get, - float_or_none, - int_or_none, - unified_timestamp, - update_url_query, - url_or_none, -) class TruNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', - 'md5': 'a19c024c3906ff954fac9b96ce66bb08', 'info_dict': { 'id': '5c5a21e65d3c196e1c0020cc', 'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech', @@ -28,48 +19,16 @@ class TruNewsIE(InfoExtractor): }, 'add_ie': ['Zype'], } + _ZYPE_TEMPL = 'https://player.zype.com/embed/%s.js?api_key=X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt' def _real_extract(self, url): display_id = self._match_id(url) - video = self._download_json( + zype_id = self._download_json( 'https://api.zype.com/videos', display_id, query={ 'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H', 'per_page': 1, 'active': 'true', 'friendly_title': display_id, - })['response'][0] - - zype_id = video['_id'] - - thumbnails = [] - thumbnails_list = video.get('thumbnails') - if isinstance(thumbnails_list, list): - for thumbnail in thumbnails_list: - if not isinstance(thumbnail, dict): - continue - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - '_type': 'url_transparent', - 'url': update_url_query( - 'https://player.zype.com/embed/%s.js' % zype_id, - {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'}), - 'ie_key': 'Zype', - 'id': zype_id, - 'display_id': display_id, - 'title': video.get('title'), - 'description': dict_get(video, ('description', 'ott_description', 'short_description')), - 'duration': int_or_none(video.get('duration')), - 'timestamp': unified_timestamp(video.get('published_at')), - 'average_rating': float_or_none(video.get('rating')), - 'view_count': int_or_none(video.get('request_count')), - 'thumbnails': thumbnails, - } + })['response'][0]['_id'] + return self.url_result(self._ZYPE_TEMPL % zype_id, 'Zype', zype_id) diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 3b16e703b..2e2e97a0c 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -4,10 +4,20 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + dict_get, + ExtractorError, + int_or_none, + js_to_json, + parse_iso8601, +) class ZypeIE(InfoExtractor): - _VALID_URL = r'https?://player\.zype\.com/embed/(?P[\da-fA-F]+)\.js\?.*?api_key=[^&]+' + _ID_RE = r'[\da-fA-F]+' + _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)=' + _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P%s)' % _ID_RE)) _TEST = { 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', 'md5': 'eaee31d474c76a955bdaba02a505c595', @@ -16,6 +26,9 @@ class ZypeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Smoky Barbecue Favorites', 'thumbnail': r're:^https?://.*\.jpe?g', + 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + 'timestamp': 1504915200, + 'upload_date': '20170909', }, } @@ -24,34 +37,98 @@ class ZypeIE(InfoExtractor): return [ mobj.group('url') for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1', + r']+\bsrc=(["\'])(?P(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE), webpage)] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + try: + response = self._download_json(re.sub( + r'\.(?:js|html)\?', '.json?', url), video_id)['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403): + raise ExtractorError(self._parse_json( + e.cause.read().decode(), video_id)['message'], expected=True) + raise - title = self._search_regex( - r'video_title\s*[:=]\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'title', group='value') + body = response['body'] + video = response['video'] + title = video['title'] - m3u8_url = self._search_regex( - r'(["\'])(?P(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage, - 'm3u8 url', group='url') - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + if isinstance(body, dict): + formats = [] + for output in body.get('outputs', []): + output_url = output.get('url') + if not output_url: + continue + name = output.get('name') + if name == 'm3u8': + formats = self._extract_m3u8_formats( + output_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + else: + f = { + 'format_id': name, + 'tbr': int_or_none(output.get('bitrate')), + 'url': output_url, + } + if name in ('m4a', 'mp3'): + f['vcodec'] = 'none' + else: + f.update({ + 'height': int_or_none(output.get('height')), + 'width': int_or_none(output.get('width')), + }) + formats.append(f) + text_tracks = body.get('subtitles') or [] + else: + m3u8_url = self._search_regex( + r'(["\'])(?P(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', + body, 'm3u8 url', group='url') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + text_tracks = self._search_regex( + r'textTracks\s*:\s*(\[[^]]+\])', + body, 'text tracks', default=None) + if text_tracks: + text_tracks = self._parse_json( + text_tracks, video_id, js_to_json, False) self._sort_formats(formats) - thumbnail = self._search_regex( - r'poster\s*[:=]\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', - default=False, group='url') + subtitles = {} + if text_tracks: + for text_track in text_tracks: + tt_url = dict_get(text_track, ('file', 'src')) + if not tt_url: + continue + subtitles.setdefault(text_track.get('label') or 'English', []).append({ + 'url': tt_url, + }) + + thumbnails = [] + for thumbnail in video.get('thumbnails', []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) return { 'id': video_id, + 'display_id': video.get('friendly_title'), 'title': title, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, + 'description': dict_get(video, ('description', 'ott_description', 'short_description')), + 'timestamp': parse_iso8601(video.get('published_at')), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('request_count')), + 'average_rating': int_or_none(video.get('rating')), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), 'formats': formats, + 'subtitles': subtitles, } From 2c482bff7c91c364c55b74846a3ae416cf588df3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 Jan 2020 14:16:58 +0100 Subject: [PATCH 10/54] [americastestkitchen] fix extraction --- youtube_dl/extractor/americastestkitchen.py | 42 ++++++++------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 8b32aa886..9c9d77ae1 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, + js_to_json, try_get, unified_strdate, ) @@ -13,22 +14,21 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' _TESTS = [{ - 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { - 'id': '1_5g5zua6e', - 'title': 'Summer Dinner Party', + 'id': '5b400b9ee338f922cb06450c', + 'title': 'Weeknight Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1497285541, - 'upload_date': '20170612', - 'uploader_id': 'roger.metcalf@americastestkitchen.com', - 'release_date': '20170617', + 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'thumbnail': r're:^https?://', + 'timestamp': 1523664000, + 'upload_date': '20180414', + 'release_date': '20180414', 'series': "America's Test Kitchen", - 'season_number': 17, - 'episode': 'Summer Dinner Party', - 'episode_number': 24, + 'season_number': 18, + 'episode': 'Weeknight Japanese Suppers', + 'episode_number': 15, }, 'params': { 'skip_download': True, @@ -47,7 +47,7 @@ class AmericasTestKitchenIE(InfoExtractor): self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', webpage, 'initial context'), - video_id) + video_id, js_to_json) ep_data = try_get( video_data, @@ -55,17 +55,7 @@ class AmericasTestKitchenIE(InfoExtractor): lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - zype_id = ep_meta.get('zype_id') - if zype_id: - embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id - ie_key = 'Zype' - else: - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id') - external_id = ep_data.get('external_id') or ep_meta['external_id'] - embed_url = 'kaltura:%s:%s' % (partner_id, external_id) - ie_key = 'Kaltura' + zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -79,8 +69,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': ie_key, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'ie_key': 'Zype', 'title': title, 'description': description, 'thumbnail': thumbnail, From 48ff5590c160b89e4596b706f2b33c69557063a0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 16 Jan 2020 15:37:16 +0100 Subject: [PATCH 11/54] [nbc] add support for nbc multi network URLs(closes #23049) --- youtube_dl/extractor/nbc.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 5bc39d002..6f3cb3003 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -87,11 +87,25 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() permalink = 'http' + compat_urllib_parse_unquote(permalink) - response = self._download_json( + video_data = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ - 'query': '''{ - page(name: "%s", platform: web, type: VIDEO, userId: "0") { - data { + 'query': '''query bonanzaPage( + $app: NBCUBrands! = nbc + $name: String! + $oneApp: Boolean + $platform: SupportedPlatforms! = web + $type: EntityPageType! = VIDEO + $userId: String! +) { + bonanzaPage( + app: $app + name: $name + oneApp: $oneApp + platform: $platform + type: $type + userId: $userId + ) { + metadata { ... on VideoPageData { description episodeNumber @@ -100,15 +114,20 @@ class NBCIE(AdobePassIE): mpxAccountId mpxGuid rating + resourceId seasonNumber secondaryTitle seriesShortTitle } } } -}''' % permalink, - }) - video_data = response['data']['page']['data'] +}''', + 'variables': json.dumps({ + 'name': permalink, + 'oneApp': True, + 'userId': '0', + }), + })['data']['bonanzaPage']['metadata'] query = { 'mbr': 'true', 'manifest': 'm3u', @@ -117,8 +136,8 @@ class NBCIE(AdobePassIE): title = video_data['secondaryTitle'] if video_data.get('locked'): resource = self._get_mvpd_resource( - 'nbcentertainment', title, video_id, - video_data.get('rating')) + video_data.get('resourceId') or 'nbcentertainment', + title, video_id, video_data.get('rating')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'nbcentertainment', resource) theplatform_url = smuggle_url(update_url_query( From c968f738df8e21d7a7f2f86f697207e0476b76ef Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 17 Jan 2020 14:23:24 +0100 Subject: [PATCH 12/54] [ard] improve extraction(closes #23761) - simplify extraction - extract age limit and series - bypass geo-restriction --- youtube_dl/extractor/ard.py | 329 ++++++++++++++-------------- youtube_dl/extractor/srmediathek.py | 4 +- 2 files changed, 169 insertions(+), 164 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 8adae4644..09d3ab4f9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -22,7 +23,101 @@ from ..utils import ( from ..compat import compat_etree_fromstring -class ARDMediathekIE(InfoExtractor): +class ARDMediathekBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) + + def _parse_media_info(self, media_info, video_id, fsk): + formats = self._extract_formats(media_info, video_id) + + if not formats: + if fsk: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + self.raise_geo_restricted( + 'This video is not available due to geoblocking', + countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'ttml', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': int_or_none(media_info.get('_duration')), + 'thumbnail': media_info.get('_previewImage'), + 'is_live': media_info.get('_isLive') is True, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + if not url_or_none(stream_url): + continue + ext = determine_ext(stream_url) + if quality != 'auto' and ext in ('f4m', 'm3u8'): + continue + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + else: + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + m = re.search( + r'_(?P\d+)x(?P\d+)\.mp4$', + stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + + +class ARDMediathekIE(ARDMediathekBaseIE): IE_NAME = 'ARD:mediathek' _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' @@ -63,94 +158,6 @@ class ARDMediathekIE(InfoExtractor): def suitable(cls, url): return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) - def _extract_media_info(self, media_info_url, webpage, video_id): - media_info = self._download_json( - media_info_url, video_id, 'Downloading media JSON') - - formats = self._extract_formats(media_info, video_id) - - if not formats: - if '"fsk"' in webpage: - raise ExtractorError( - 'This video is only available after 20:00', expected=True) - elif media_info.get('_geoblocked'): - raise ExtractorError('This video is not available due to geo restriction', expected=True) - - self._sort_formats(formats) - - duration = int_or_none(media_info.get('_duration')) - thumbnail = media_info.get('_previewImage') - is_live = media_info.get('_isLive') is True - - subtitles = {} - subtitle_url = media_info.get('_subtitleUrl') - if subtitle_url: - subtitles['de'] = [{ - 'ext': 'ttml', - 'url': subtitle_url, - }] - - return { - 'id': video_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'is_live': is_live, - 'formats': formats, - 'subtitles': subtitles, - } - - def _extract_formats(self, media_info, video_id): - type_ = media_info.get('_type') - media_array = media_info.get('_mediaArray', []) - formats = [] - for num, media in enumerate(media_array): - for stream in media.get('_mediaStreamArray', []): - stream_urls = stream.get('_stream') - if not stream_urls: - continue - if not isinstance(stream_urls, list): - stream_urls = [stream_urls] - quality = stream.get('_quality') - server = stream.get('_server') - for stream_url in stream_urls: - if not url_or_none(stream_url): - continue - ext = determine_ext(stream_url) - if quality != 'auto' and ext in ('f4m', 'm3u8'): - continue - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(stream_url, { - 'hdcore': '3.1.1', - 'plugin': 'aasp-3.1.1.69.124' - }), - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - else: - if server and server.startswith('rtmp'): - f = { - 'url': server, - 'play_path': stream_url, - 'format_id': 'a%s-rtmp-%s' % (num, quality), - } - else: - f = { - 'url': stream_url, - 'format_id': 'a%s-%s-%s' % (num, ext, quality) - } - m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) - if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - if type_ == 'audio': - f['vcodec'] = 'none' - formats.append(f) - return formats - def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -302,19 +309,20 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' +class ARDBetaMediathekIE(ARDMediathekBaseIE): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', - 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'title': 'Tatort: Die robuste Roswita', + 'id': '70153354', + 'title': 'Die robuste Roswita', 'description': r're:^Der Mord.*trüber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', - 'upload_date': '20180826', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', + 'timestamp': 1577047500, + 'upload_date': '20191222', 'ext': 'mp4', }, }, { @@ -330,71 +338,68 @@ class ARDBetaMediathekIE(InfoExtractor): video_id = mobj.group('video_id') display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') - data = self._parse_json(data_json, display_id) - - res = { - 'id': video_id, - 'display_id': display_id, + player_page = self._download_json( + 'https://api.ardmediathek.de/public-gateway', + display_id, data=json.dumps({ + 'query': '''{ + playerPage(client:"%s", clipId: "%s") { + blockedByFsk + broadcastedOn + maturityContentRating + mediaCollection { + _duration + _geoblocked + _isLive + _mediaArray { + _mediaStreamArray { + _quality + _server + _stream } - formats = [] - subtitles = {} - geoblocked = False - for widget in data.values(): - if widget.get('_geoblocked') is True: - geoblocked = True - if '_duration' in widget: - res['duration'] = int_or_none(widget['_duration']) - if 'clipTitle' in widget: - res['title'] = widget['clipTitle'] - if '_previewImage' in widget: - res['thumbnail'] = widget['_previewImage'] - if 'broadcastedOn' in widget: - res['timestamp'] = unified_timestamp(widget['broadcastedOn']) - if 'synopsis' in widget: - res['description'] = widget['synopsis'] - subtitle_url = url_or_none(widget.get('_subtitleUrl')) - if subtitle_url: - subtitles.setdefault('de', []).append({ - 'ext': 'ttml', - 'url': subtitle_url, - }) - if '_quality' in widget: - format_url = url_or_none(try_get( - widget, lambda x: x['_stream']['json'][0])) - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url + '?hdcore=3.11.0', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - fatal=False)) - else: - # HTTP formats are not available when geoblocked is True, - # other formats are fine though - if geoblocked: - continue - quality = str_or_none(widget.get('_quality')) - formats.append({ - 'format_id': ('http-' + quality) if quality else 'http', - 'url': format_url, - 'preference': 10, # Plain HTTP, that's nice - }) - - if not formats and geoblocked: - self.raise_geo_restricted( - msg='This video is not available due to geoblocking', - countries=['DE']) - - self._sort_formats(formats) - res.update({ - 'subtitles': subtitles, - 'formats': formats, + } + _previewImage + _subtitleUrl + _type + } + show { + title + } + synopsis + title + tracking { + atiCustomVars { + contentId + } + } + } +}''' % (mobj.group('client'), video_id), + }).encode(), headers={ + 'Content-Type': 'application/json' + })['data']['playerPage'] + title = player_page['title'] + content_id = str_or_none(try_get( + player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) + media_collection = player_page.get('mediaCollection') or {} + if not media_collection and content_id: + media_collection = self._download_json( + 'https://www.ardmediathek.de/play/media/' + content_id, + content_id, fatal=False) or {} + info = self._parse_media_info( + media_collection, content_id or video_id, + player_page.get('blockedByFsk')) + age_limit = None + description = player_page.get('synopsis') + maturity_content_rating = player_page.get('maturityContentRating') + if maturity_content_rating: + age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) + if not age_limit: + age_limit = int_or_none(self._search_regex(r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + info.update({ + 'age_limit': age_limit, + 'display_id': display_id, + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), + 'series': try_get(player_page, lambda x: x['show']['title']), }) - - return res + return info diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 28baf901c..359dadaa3 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .ard import ARDMediathekIE +from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, get_element_by_attribute, ) -class SRMediathekIE(ARDMediathekIE): +class SRMediathekIE(ARDMediathekBaseIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P[0-9]+)' From d9a2f8679136228ddead5f09bb17e006cccaeffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jan 2020 14:46:38 +0700 Subject: [PATCH 13/54] [ivi:compilation] Fix entries extraction (closes #23770) --- youtube_dl/extractor/ivi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index a502e8806..b5a740a01 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -239,7 +239,7 @@ class IviCompilationIE(InfoExtractor): self.url_result( 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) for serie in re.findall( - r']+data-id="\1"' % compilation_id, html)] + r']+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From a4b2769451a632837e9f0e86511ed50b5e9c90b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jan 2020 15:05:45 +0700 Subject: [PATCH 14/54] [24video] Add support for 24video.vip (closes #23753) --- youtube_dl/extractor/twentyfourvideo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 1d66eeaff..2830c212e 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -18,7 +18,7 @@ class TwentyFourVideoIE(InfoExtractor): https?:// (?P (?:(?:www|porno)\.)?24video\. - (?:net|me|xxx|sexy?|tube|adult|site) + (?:net|me|xxx|sexy?|tube|adult|site|vip) )/ (?: video/(?:(?:view|xml)/)?| @@ -59,6 +59,9 @@ class TwentyFourVideoIE(InfoExtractor): }, { 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', 'only_matching': True, + }, { + 'url': 'https://www.24video.vip/video/view/1044982', + 'only_matching': True, }] def _real_extract(self, url): From fd032450f09c18a971c2a7d4b2e251c8444d69b4 Mon Sep 17 00:00:00 2001 From: PB <3854688+uno20001@users.noreply.github.com> Date: Sat, 18 Jan 2020 16:47:50 +0100 Subject: [PATCH 15/54] [businessinsider] Fix jwplatform id extraction (closes #22929) (#22954) --- youtube_dl/extractor/businessinsider.py | 28 +++++++++++++++---------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py index dfcf9bc6b..73a57b1e4 100644 --- a/youtube_dl/extractor/businessinsider.py +++ b/youtube_dl/extractor/businessinsider.py @@ -9,21 +9,26 @@ class BusinessInsiderIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6', - 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'md5': 'ffed3e1e12a6f950aa2f7d83851b497a', 'info_dict': { - 'id': 'hZRllCfw', + 'id': 'cjGDb0X9', 'ext': 'mp4', - 'title': "Here's how much radiation you're exposed to in everyday life", - 'description': 'md5:9a0d6e2c279948aadaa5e84d6d9b99bd', - 'upload_date': '20170709', - 'timestamp': 1499606400, - }, - 'params': { - 'skip_download': True, + 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", + 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', + 'upload_date': '20160611', + 'timestamp': 1465675620, }, }, { 'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/', - 'only_matching': True, + 'md5': '43f438dbc6da0b89f5ac42f68529d84a', + 'info_dict': { + 'id': '5zJwd4FK', + 'ext': 'mp4', + 'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort', + 'description': 'md5:2af8975825d38a4fed24717bbe51db49', + 'upload_date': '20170705', + 'timestamp': 1499270528, + }, }, { 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T', 'only_matching': True, @@ -35,7 +40,8 @@ class BusinessInsiderIE(InfoExtractor): jwplatform_id = self._search_regex( (r'data-media-id=["\']([a-zA-Z0-9]{8})', r'id=["\']jwplayer_([a-zA-Z0-9]{8})', - r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})'), + r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})', + r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), webpage, 'jwplatform id') return self.url_result( 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), From f4a18db748d710616d6886d2283a1661b1993783 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 19 Jan 2020 18:24:00 +0100 Subject: [PATCH 16/54] [ard] add a missing condition --- youtube_dl/extractor/ard.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 09d3ab4f9..2f47e21c3 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -392,8 +392,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): maturity_content_rating = player_page.get('maturityContentRating') if maturity_content_rating: age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit: - age_limit = int_or_none(self._search_regex(r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) + if not age_limit and description: + age_limit = int_or_none(self._search_regex( + r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, 'display_id': display_id, From 9cf30dc017d2c04a8d7b3d455899a47a41de9bd7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 19 Jan 2020 19:30:48 +0100 Subject: [PATCH 17/54] [azmedien] fix extraction(closes #23783) --- youtube_dl/extractor/azmedien.py | 36 +++++++------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index fcbdc71b9..b1e20def5 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,39 +47,19 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' _PARTNER_ID = '1719221' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - entry_id = mobj.group('kaltura_id') + host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups() if not entry_id: - api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) - payload = { - 'query': '''query VideoContext($articleId: ID!) { - article: node(id: $articleId) { - ... on Article { - mainAssetRelation { - asset { - ... on VideoAsset { - kalturaId - } - } - } - } - } - }''', - 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, - } - json_data = self._download_json( - api_url, video_id, headers={ - 'Content-Type': 'application/json', - }, - data=json.dumps(payload).encode()) - entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] + entry_id = self._download_json( + self._API_TEMPL % (host, host.split('.')[0]), display_id, query={ + 'variables': json.dumps({ + 'contextId': 'NewsArticle:' + article_id, + }), + })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] return self.url_result( 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), From be96f9924f4b93ab4632b602a7e7b97518a1ddab Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 19 Jan 2020 20:15:02 +0100 Subject: [PATCH 18/54] [voicerepublic] fix extraction --- youtube_dl/extractor/voicerepublic.py | 76 +++++++-------------------- 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 59e1359c4..a52e40afa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,17 +1,12 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, int_or_none, - sanitized_Request, + urljoin, ) @@ -26,8 +21,7 @@ class VoiceRepublicIE(InfoExtractor): 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', - 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', - 'duration': 1800, + 'duration': 1556, 'view_count': int, } }, { @@ -38,63 +32,31 @@ class VoiceRepublicIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - req = sanitized_Request( - compat_urlparse.urljoin(url, '/talks/%s' % display_id)) - # Older versions of Firefox get redirected to an "upgrade browser" page - req.add_header('User-Agent', 'youtube-dl') - webpage = self._download_webpage(req, display_id) + webpage = self._download_webpage(url, display_id) if '>Queued for processing, please stand by...<' in webpage: raise ExtractorError( 'Audio is still queued for processing', expected=True) - config = self._search_regex( - r'(?s)return ({.+?});\s*\n', webpage, - 'data', default=None) - data = self._parse_json(config, display_id, fatal=False) if config else None - if data: - title = data['title'] - description = data.get('teaser') - talk_id = compat_str(data.get('talk_id') or display_id) - talk = data['talk'] - duration = int_or_none(talk.get('duration')) - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in talk['links'].items()] - else: - title = self._og_search_title(webpage) - description = self._html_search_regex( - r"(?s)
]*>(.+?)
", - webpage, 'description', fatal=False) - talk_id = self._search_regex( - [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], - webpage, 'talk id', default=None) or display_id - duration = None - player = self._search_regex( - r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') - formats = [{ - 'url': compat_urlparse.urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] + talk = self._parse_json(self._search_regex( + r'initialSnapshot\s*=\s*({.+?});', + webpage, 'talk'), display_id)['talk'] + title = talk['title'] + formats = [{ + 'url': urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['media_links'].items()] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - view_count = int_or_none(self._search_regex( - r"class='play-count[^']*'>\s*(\d+) plays", - webpage, 'play count', fatal=False)) - return { - 'id': talk_id, + 'id': compat_str(talk.get('id') or display_id), 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, + 'description': talk.get('teaser'), + 'thumbnail': talk.get('image_url'), + 'duration': int_or_none(talk.get('archived_duration')), + 'view_count': int_or_none(talk.get('play_count')), 'formats': formats, } From 22cb94902f5bbe32d636009c2599eae7aa66282c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 19 Jan 2020 21:20:56 +0100 Subject: [PATCH 19/54] [stretchinternet] fix extraction(closes #4319) --- youtube_dl/extractor/stretchinternet.py | 38 +++++++------------------ 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index ae2ac1b42..4dbead2ba 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -5,44 +5,28 @@ from ..utils import int_or_none class StretchInternetIE(InfoExtractor): - _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/portal\.htm\?.*?\beventId=(?P\d+)' + _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P\d+)' _TEST = { - 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=313900&streamType=video', + 'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video', 'info_dict': { - 'id': '313900', + 'id': '573272', 'ext': 'mp4', - 'title': 'Augustana (S.D.) Baseball vs University of Mary', - 'description': 'md5:7578478614aae3bdd4a90f578f787438', - 'timestamp': 1490468400, - 'upload_date': '20170325', + 'title': 'University of Mary Wrestling vs. Upper Iowa', + 'timestamp': 1575668361, + 'upload_date': '20191206', } } def _real_extract(self, url): video_id = self._match_id(url) - stream = self._download_json( - 'https://neo-client.stretchinternet.com/streamservice/v1/media/stream/v%s' - % video_id, video_id) - - video_url = 'https://%s' % stream['source'] - event = self._download_json( - 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', - video_id, query={ - 'clientID': 99997, - 'eventID': video_id, - 'token': 'asdf', - })['event'] - - title = event.get('title') or event['mobileTitle'] - description = event.get('customText') - timestamp = int_or_none(event.get('longtime')) + 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, + video_id)[0] return { 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'url': video_url, + 'title': event['title'], + 'timestamp': int_or_none(event.get('dateCreated'), 1000), + 'url': 'https://' + event['media'][0]['url'], } From c3cfea906869e8358652e382679a5996c2aec73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jan 2020 04:09:10 +0700 Subject: [PATCH 20/54] [youtube] Fix sigfunc name extraction (closes #23819) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b913d07a6..eacaa5ecd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1343,6 +1343,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', From bffdedfabd7ef2d9491a4f2c6a42a8326e739589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jan 2020 04:14:08 +0700 Subject: [PATCH 21/54] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index cc7fc4323..f72c87133 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Extractors +* [youtube] Fix sigfunc name extraction (#23819) +* [stretchinternet] Fix extraction (#4319) +* [voicerepublic] Fix extraction +* [azmedien] Fix extraction (#23783) +* [businessinsider] Fix jwplatform id extraction (#22929, #22954) ++ [24video] Add support for 24video.vip (#23753) +* [ivi:compilation] Fix entries extraction (#23770) +* [ard] Improve extraction (#23761) + * Simplify extraction + + Extract age limit and series + * Bypass geo-restriction ++ [nbc] Add support for nbc multi network URLs (#23049) +* [americastestkitchen] Fix extraction +* [zype] Improve extraction + + Extract subtitles (#21258) + + Support URLs with alternative keys/tokens (#21258) + + Extract more metadata +* [orf:tvthek] Improve geo restricted videos detection (#23741) +* [soundcloud] Restore previews extraction (#23739) + + version 2020.01.15 Extractors From 76dbe4df5f931de47a25c7962ef55b2a261cf1a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jan 2020 04:16:05 +0700 Subject: [PATCH 22/54] release 2020.01.24 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index cf8e6e411..73f46ec04 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.15 + [debug] youtube-dl version 2020.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index babbda464..7e3c9f669 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 5498983ff..b9bb3bd11 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d46735951..265ea80c1 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.01.15 + [debug] youtube-dl version 2020.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 748b64756..e71778a3d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.01.15** +- [ ] I've verified that I'm running youtube-dl version **2020.01.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index f72c87133..94aa9f327 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.01.24 Extractors * [youtube] Fix sigfunc name extraction (#23819) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 932b138a9..fa6f7289a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.01.15' +__version__ = '2020.01.24' From 2a5c26c9803ed8801bac2419128f1778bae558df Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 23 Jan 2020 23:20:48 +0100 Subject: [PATCH 23/54] [soundcloud] imporve private playlist/set tracks extraction https://github.com/ytdl-org/youtube-dl/issues/3707#issuecomment-577873539 --- youtube_dl/extractor/soundcloud.py | 32 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index a0b09f5b1..a1372d389 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -238,7 +238,7 @@ class SoundcloudIE(InfoExtractor): 'ext': 'mp3', 'title': 'Mezzo Valzer', 'description': 'md5:4138d582f81866a530317bae316e8b61', - 'uploader': 'Giovanni Sarani', + 'uploader': 'Micronie', 'uploader_id': '3352531', 'timestamp': 1551394171, 'upload_date': '20190228', @@ -524,7 +524,17 @@ class SoundcloudIE(InfoExtractor): class SoundcloudPlaylistBaseIE(SoundcloudIE): - def _extract_track_entries(self, tracks, token=None): + def _extract_set(self, playlist, token=None): + playlist_id = compat_str(playlist['id']) + tracks = playlist.get('tracks') or [] + if not all([t.get('permalink_url') for t in tracks]) and token: + tracks = self._download_json( + self._API_V2_BASE + 'tracks', playlist_id, + 'Downloading tracks', query={ + 'ids': ','.join([compat_str(t['id']) for t in tracks]), + 'playlistId': playlist_id, + 'playlistSecretToken': token, + }) entries = [] for track in tracks: track_id = str_or_none(track.get('id')) @@ -537,7 +547,10 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE): url += '?secret_token=' + token entries.append(self.url_result( url, SoundcloudIE.ie_key(), track_id)) - return entries + return self.playlist_result( + entries, playlist_id, + playlist.get('title'), + playlist.get('description')) class SoundcloudSetIE(SoundcloudPlaylistBaseIE): @@ -548,6 +561,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'info_dict': { 'id': '2284613', 'title': 'The Royal Concept EP', + 'description': 'md5:71d07087c7a449e8941a70a29e34671e', }, 'playlist_mincount': 5, }, { @@ -570,13 +584,10 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) - entries = self._extract_track_entries(info['tracks'], token) - - return self.playlist_result( - entries, str_or_none(info.get('id')), info.get('title')) + return self._extract_set(info, token) -class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { 'limit': 2000000000, @@ -774,10 +785,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): self._API_V2_BASE + 'playlists/' + playlist_id, playlist_id, 'Downloading playlist', query=query) - entries = self._extract_track_entries(data['tracks'], token) - - return self.playlist_result( - entries, playlist_id, data.get('title'), data.get('description')) + return self._extract_set(data, token) class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): From 43e7994749472be22318c2e4bad81ac5cd17e37f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 26 Jan 2020 14:15:49 +0100 Subject: [PATCH 24/54] [svt] fix article extraction(closes #22897)(closes #22919) --- youtube_dl/extractor/svt.py | 52 ++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 0901c3163..6f73c2d12 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -6,17 +6,17 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, dict_get, int_or_none, - orderedSet, + str_or_none, strip_or_none, try_get, urljoin, - compat_str, ) @@ -318,26 +318,26 @@ class SVTSeriesIE(SVTPlayBaseIE): class SVTPageIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P(?:[^/]+/)*(?P[^/?&#]+))' _TESTS = [{ - 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', + 'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa', 'info_dict': { - 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill', - 'title': 'GUIDE: Sommarträning du kan göra var och när du vill', + 'id': '25298267', + 'title': 'Bakom masken – Lehners kamp mot mental ohälsa', }, - 'playlist_count': 7, + 'playlist_count': 4, }, { - 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', + 'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien', 'info_dict': { - 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner', - 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”', + 'id': '24243746', + 'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien', }, - 'playlist_count': 1, + 'playlist_count': 2, }, { # only programTitle 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', 'info_dict': { - 'id': '2900353', + 'id': '8439V2K', 'ext': 'mp4', 'title': 'Stjärnorna skojar till det - under SVT-intervjun', 'duration': 27, @@ -356,16 +356,26 @@ class SVTPageIE(InfoExtractor): return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): - playlist_id = self._match_id(url) + path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + article = self._download_json( + 'https://api.svt.se/nss-api/page/' + path, display_id, + query={'q': 'articles'})['articles']['content'][0] - entries = [ - self.url_result( - 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r'data-video-id=["\'](\d+)', webpage))] + entries = [] - title = strip_or_none(self._og_search_title(webpage, default=None)) + def _process_content(content): + if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'): + video_id = compat_str(content['image']['svtId']) + entries.append(self.url_result( + 'svt:' + video_id, SVTPlayIE.ie_key(), video_id)) - return self.playlist_result(entries, playlist_id, title) + for media in article.get('media', []): + _process_content(media) + + for obj in article.get('structuredBody', []): + _process_content(obj.get('content') or {}) + + return self.playlist_result( + entries, str_or_none(article.get('id')), + strip_or_none(article.get('title'))) From 8e4d3f83ce486aebc851f6f74fa970f331ca338e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 26 Jan 2020 16:17:51 +0100 Subject: [PATCH 25/54] [svt] fix series extraction(closes #22297) --- youtube_dl/extractor/svt.py | 99 ++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 6f73c2d12..e12389cad 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( determine_ext, dict_get, @@ -16,7 +12,6 @@ from ..utils import ( str_or_none, strip_or_none, try_get, - urljoin, ) @@ -237,23 +232,23 @@ class SVTPlayIE(SVTPlayBaseIE): class SVTSeriesIE(SVTPlayBaseIE): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)' + _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P[^/?&#]+)(?:.+?\btab=(?P[^&#]+))?' _TESTS = [{ 'url': 'https://www.svtplay.se/rederiet', 'info_dict': { - 'id': 'rederiet', + 'id': '14445680', 'title': 'Rederiet', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, 'playlist_mincount': 318, }, { - 'url': 'https://www.svtplay.se/rederiet?tab=sasong2', + 'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680', 'info_dict': { - 'id': 'rederiet-sasong2', + 'id': 'season-2-14445680', 'title': 'Rederiet - Säsong 2', - 'description': 'md5:505d491a58f4fcf6eb418ecab947e69e', + 'description': 'md5:d9fdfff17f5d8f73468176ecd2836039', }, - 'playlist_count': 12, + 'playlist_mincount': 12, }] @classmethod @@ -261,60 +256,64 @@ class SVTSeriesIE(SVTPlayBaseIE): return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url) def _real_extract(self, url): - series_id = self._match_id(url) + series_slug, season_id = re.match(self._VALID_URL, url).groups() - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - season_slug = qs.get('tab', [None])[0] - - if season_slug: - series_id += '-%s' % season_slug - - webpage = self._download_webpage( - url, series_id, 'Downloading series page') - - root = self._parse_json( - self._search_regex( - self._SVTPLAY_RE, webpage, 'content', group='json'), - series_id) + series = self._download_json( + 'https://api.svt.se/contento/graphql', series_slug, + 'Downloading series page', query={ + 'query': '''{ + listablesBySlug(slugs: ["%s"]) { + associatedContent(include: [productionPeriod, season]) { + items { + item { + ... on Episode { + videoSvtId + } + } + } + id + name + } + id + longDescription + name + shortDescription + } +}''' % series_slug, + })['data']['listablesBySlug'][0] season_name = None entries = [] - for season in root['relatedVideoContent']['relatedVideosAccordion']: + for season in series['associatedContent']: if not isinstance(season, dict): continue - if season_slug: - if season.get('slug') != season_slug: + if season_id: + if season.get('id') != season_id: continue season_name = season.get('name') - videos = season.get('videos') - if not isinstance(videos, list): + items = season.get('items') + if not isinstance(items, list): continue - for video in videos: - content_url = video.get('contentUrl') - if not content_url or not isinstance(content_url, compat_str): + for item in items: + video = item.get('item') or {} + content_id = video.get('videoSvtId') + if not content_id or not isinstance(content_id, compat_str): continue - entries.append( - self.url_result( - urljoin(url, content_url), - ie=SVTPlayIE.ie_key(), - video_title=video.get('title') - )) + entries.append(self.url_result( + 'svt:' + content_id, SVTPlayIE.ie_key(), content_id)) - metadata = root.get('metaData') - if not isinstance(metadata, dict): - metadata = {} - - title = metadata.get('title') - season_name = season_name or season_slug + title = series.get('name') + season_name = season_name or season_id if title and season_name: title = '%s - %s' % (title, season_name) - elif season_slug: - title = season_slug + elif season_id: + title = season_id return self.playlist_result( - entries, series_id, title, metadata.get('description')) + entries, season_id or series.get('id'), title, + dict_get(series, ('longDescription', 'shortDescription'))) class SVTPageIE(InfoExtractor): From 4877ffc0e9c1f27262bf6b5a4972d11edd487bfe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 27 Jan 2020 15:12:08 +0100 Subject: [PATCH 26/54] [viewlift] improve extraction - fix extraction(closes #23851) - add add support for authentication - add support for more domains --- youtube_dl/extractor/viewlift.py | 304 +++++++++++++------------------ 1 file changed, 126 insertions(+), 178 deletions(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 851ad936c..d6b92b1c8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,28 +1,62 @@ from __future__ import unicode_literals -import base64 +import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - clean_html, - determine_ext, int_or_none, - js_to_json, parse_age_limit, - parse_duration, - try_get, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' + _API_BASE = 'https://prod-api.viewlift.com/' + _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv' + _SITE_MAP = { + 'ftfnext': 'lax', + 'funnyforfree': 'snagfilms', + 'hoichoi': 'hoichoitv', + 'kiddovid': 'snagfilms', + 'laxsportsnetwork': 'lax', + 'legapallacanestro': 'lnp', + 'marquee': 'marquee-tv', + 'monumentalsportsnetwork': 'monumental-network', + 'moviespree': 'bingeflix', + 'pflmma': 'pfl', + 'snagxtreme': 'snagfilms', + 'theidentitytb': 'tampabay', + 'vayafilm': 'snagfilms', + } + _TOKENS = {} + + def _call_api(self, site, path, video_id, query): + token = self._TOKENS.get(site) + if not token: + token_query = {'site': site} + email, password = self._get_login_info(netrc_machine=site) + if email: + resp = self._download_json( + self._API_BASE + 'identity/signin', video_id, + 'Logging in', query=token_query, data=json.dumps({ + 'email': email, + 'password': password, + }).encode()) + else: + resp = self._download_json( + self._API_BASE + 'identity/anonymous-token', video_id, + 'Downloading authorization token', query=token_query) + self._TOKENS[site] = token = resp['authorizationToken'] + return self._download_json( + self._API_BASE + path, video_id, + headers={'Authorization': token}, query=query) class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift:embed' + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -30,6 +64,9 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', + 'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8', + 'timestamp': 1334350096, + 'upload_date': '20120413', } }, { # invalid labels, 360p is better that 480p @@ -39,7 +76,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', 'ext': 'mp4', 'title': 'Life in Limbo', - } + }, + 'skip': 'The video does not exist', }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -54,67 +92,68 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): return mobj.group('url') def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - if '>This film is not playable in your area.<' in webpage: - raise ExtractorError( - 'Film %s is not playable in your area.' % video_id, expected=True) + domain, film_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + try: + content_data = self._call_api( + site, 'entitlement/video/status', film_id, { + 'id': film_id + })['video'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage') + if error_message == 'User does not have a valid subscription or has not purchased this content.': + self.raise_login_required() + raise ExtractorError(error_message, expected=True) + raise + gist = content_data['gist'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] formats = [] - has_bitrate = False - sources = self._parse_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, - 'sources', default='[]'), video_id, js_to_json) - for source in sources: - file_ = source.get('file') - if not file_: + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: continue - type_ = source.get('type') - ext = determine_ext(file_) - format_id = source.get('label') or ext - if all(v in ('m3u8', 'hls') for v in (type_, ext)): - formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - bitrate = int_or_none(self._search_regex( - [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], - file_, 'bitrate', default=None)) - if not has_bitrate and bitrate: - has_bitrate = True - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append({ - 'url': file_, - 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), - 'tbr': bitrate, - 'height': height, - }) - if not formats: - hls_url = self._parse_json(self._search_regex( - r'filmInfo\.src\s*=\s*({.+?});', - webpage, 'src'), video_id, js_to_json)['src'] - formats = self._extract_m3u8_formats( - hls_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') - self._sort_formats(formats, field_preference) + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - title = self._search_regex( - [r"title\s*:\s*'([^']+)'", r'([^<]+)'], - webpage, 'title') + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) - return { - 'id': video_id, + info = { + 'id': film_id, 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info class ViewLiftIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX + IE_NAME = 'viewlift' + _VALID_URL = r'https?://(?:www\.)?(?P%s)(?P(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -151,10 +190,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'id': '00000148-7b53-de26-a9fb-fbf306f70020', 'display_id': 'augie_alone/s_2_ep_12_love', 'ext': 'mp4', - 'title': 'Augie, Alone:S. 2 Ep. 12 - Love', - 'description': 'md5:db2a5c72d994f16a780c1eb353a8f403', + 'title': 'S. 2 Ep. 12 - Love', + 'description': 'Augie finds love.', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 107, + 'upload_date': '20141012', + 'timestamp': 1413129540, + 'age_limit': 17, }, 'params': { 'skip_download': True, @@ -177,6 +219,9 @@ class ViewLiftIE(ViewLiftBaseIE): # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', 'only_matching': True, + }, { + 'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters', + 'only_matching': True, }] @classmethod @@ -184,119 +229,22 @@ class ViewLiftIE(ViewLiftBaseIE): return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url) def _real_extract(self, url): - domain, display_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, display_id) - - if ">Sorry, the Film you're looking for is not available.<" in webpage: - raise ExtractorError( - 'Film %s is not available.' % display_id, expected=True) - - initial_store_state = self._search_regex( - r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", - webpage, 'Initial Store State', default=None) - if initial_store_state: - modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( - initial_store_state).decode()), display_id)['page']['data']['modules'] - content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') - gist = content_data['gist'] - film_id = gist['id'] - title = gist['title'] - video_assets = try_get( - content_data, lambda x: x['streamingInfo']['videoAssets'], dict) - if not video_assets: - token = self._download_json( - 'https://prod-api.viewlift.com/identity/anonymous-token', - film_id, 'Downloading authorization token', - query={'site': 'snagfilms'})['authorizationToken'] - video_assets = self._download_json( - 'https://prod-api.viewlift.com/entitlement/video/status', - film_id, headers={ - 'Authorization': token, - 'Referer': url, - }, query={ - 'id': film_id - })['video']['streamingInfo']['videoAssets'] - - formats = [] - mpeg_video_assets = video_assets.get('mpeg') or [] - for video_asset in mpeg_video_assets: - video_asset_url = video_asset.get('url') - if not video_asset: - continue - bitrate = int_or_none(video_asset.get('bitrate')) - height = int_or_none(self._search_regex( - r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), - 'height', default=None)) - formats.append({ - 'url': video_asset_url, - 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), - 'tbr': bitrate, - 'height': height, - 'vcodec': video_asset.get('codec'), - }) - - hls_url = video_assets.get('hls') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats, ('height', 'tbr', 'format_id')) - - info = { - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': gist.get('description'), - 'thumbnail': gist.get('videoImageUrl'), - 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating')), - 'timestamp': int_or_none(gist.get('publishDate'), 1000), - 'formats': formats, - } - for k in ('categories', 'tags'): - info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] - return info - else: - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), - display_id) - - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break - else: - title = self._html_search_regex( - (r'itemprop="title">([^<]+)<', - r'(?s)itemprop="title">(.+?)(.+?)', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'
([^<]+)', webpage) - - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + domain, path, display_id = re.match(self._VALID_URL, url).groups() + site = domain.split('.')[-2] + if site in self._SITE_MAP: + site = self._SITE_MAP[site] + modules = self._call_api( + site, 'content/pages', display_id, { + 'includeContent': 'true', + 'moduleOffset': 1, + 'path': path, + 'site': site, + })['modules'] + film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule') + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'ie_key': 'ViewLiftEmbed', + } From 51c7f40c83a12f9dc0fce0b9e5102a0c13467b6a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 27 Jan 2020 23:37:29 +0100 Subject: [PATCH 27/54] [vimeo] fix album extraction(closes #23864) --- youtube_dl/extractor/vimeo.py | 68 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index baa46d5f3..f378aa283 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -841,33 +841,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._TITLE or self._html_search_regex( self._TITLE_RE, webpage, 'list title', fatal=False) - def _login_list_password(self, page_url, list_id, webpage): - login_form = self._search_regex( - r'(?s)]+?id="pw_form"(.*?)', - webpage, 'login form', default=None) - if not login_form: - return webpage - - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) - fields = self._hidden_inputs(login_form) - token, vuid = self._extract_xsrft_and_vuid(webpage) - fields['token'] = token - fields['password'] = password - post = urlencode_postdata(fields) - password_path = self._search_regex( - r'action="([^"]+)"', login_form, 'password URL') - password_url = compat_urlparse.urljoin(page_url, password_path) - password_request = sanitized_Request(password_url, post) - password_request.add_header('Content-type', 'application/x-www-form-urlencoded') - self._set_vimeo_cookie('vuid', vuid) - self._set_vimeo_cookie('xsrft', token) - - return self._download_webpage( - password_request, list_id, - 'Verifying the password', 'Wrong password') - def _title_and_entries(self, list_id, base_url): for pagenum in itertools.count(1): page_url = self._page_url(base_url, pagenum) @@ -876,7 +849,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): 'Downloading page %s' % pagenum) if pagenum == 1: - webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) # Try extracting href first since not all videos are available via @@ -923,7 +895,7 @@ class VimeoUserIE(VimeoChannelIE): _BASE_URL_TEMPL = 'https://vimeo.com/%s' -class VimeoAlbumIE(VimeoChannelIE): +class VimeoAlbumIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:album' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'