From f8a12427a9ccdb8506be64c2b56eee7f8872ac3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Dec 2019 00:18:37 +0700 Subject: [PATCH 01/21] [teachable] Improve locked lessons detection (#23528) --- youtube_dl/extractor/teachable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index 7d2e34b3b..b82414c3d 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -165,7 +165,10 @@ class TeachableIE(TeachableBaseIE): if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', r'>\s*Lecture contents locked', - r'id=["\']lecture-locked')): + r'id=["\']lecture-locked', + # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313 + r'class=["\'](?:inner-)?lesson-locked', + r'>LESSON LOCKED<')): self.raise_login_required('Lecture contents locked') title = self._og_search_title(webpage, default=None) From 941e359e9512c1c75d42cb5b4b248816e16edb82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Dec 2019 00:26:12 +0700 Subject: [PATCH 02/21] [teachable] Fail with error message if no video URL found --- youtube_dl/extractor/teachable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index b82414c3d..6b7f13b43 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -170,6 +170,7 @@ class TeachableIE(TeachableBaseIE): r'class=["\'](?:inner-)?lesson-locked', r'>LESSON LOCKED<')): self.raise_login_required('Lecture contents locked') + raise ExtractorError('Unable to find video URL') title = self._og_search_title(webpage, default=None) From cb7e053e0a6542b2db145c16291361e1f2d5ba2c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 29 Dec 2019 19:25:21 +0100 Subject: [PATCH 03/21] [extractors] add missing import for ScrippsNetworksIE --- youtube_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 376d07727..7b05f5410 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -963,7 +963,10 @@ from .savefrom import SaveFromIE from .sbs import SBSIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ScrippsNetworksWatchIE +from .scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) from .scte import ( SCTEIE, SCTECourseIE, From 75ef77c1b18e943933a635ba28a47ec4c9671504 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 29 Dec 2019 19:30:50 +0100 Subject: [PATCH 04/21] [brightcove] cache brightcove player policy keys --- youtube_dl/extractor/brightcove.py | 36 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 8e2f7217a..9553f82d6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -586,24 +586,26 @@ class BrightcoveNewIE(AdobePassIE): account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - + policy_key_id = '%s_%s' % (account_id, player_id) + policy_key = self._downloader.cache.load('brightcove', policy_key_id) if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') + self._downloader.cache.store('brightcove', policy_key_id, policy_key) api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) headers = { From 0c15a56f1c7afa77347b5e3b1ae9811662291f25 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 30 Dec 2019 22:31:11 +0100 Subject: [PATCH 05/21] [prosiebensat1] improve geo restriction handling(closes #23571) --- youtube_dl/extractor/prosiebensat1.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index e19a470a5..1bc4f9b6b 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -16,7 +16,7 @@ from ..utils import ( class ProSiebenSat1BaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] + _GEO_BYPASS = False _ACCESS_ID = None _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear' _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get' @@ -39,14 +39,18 @@ class ProSiebenSat1BaseIE(InfoExtractor): formats = [] if self._ACCESS_ID: raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID - server_token = (self._download_json( + protocols = self._download_json( self._V4_BASE_URL + 'protocols', clip_id, 'Downloading protocols JSON', headers=self.geo_verification_headers(), query={ 'access_id': self._ACCESS_ID, 'client_token': sha1((raw_ct).encode()).hexdigest(), 'video_id': clip_id, - }, fatal=False) or {}).get('server_token') + }, fatal=False, expected_status=(403,)) or {} + error = protocols.get('error') or {} + if error.get('title') == 'Geo check failed': + self.raise_geo_restricted(countries=['AT', 'CH', 'DE']) + server_token = protocols.get('server_token') if server_token: urls = (self._download_json( self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ From 3bed621750b7fe25afc04a0131664bbbc610c563 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 31 Dec 2019 09:49:29 +0100 Subject: [PATCH 06/21] [soundcloud] automatically update client id on failing requests --- youtube_dl/extractor/soundcloud.py | 45 +++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2128e5957..b3ffef8df 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -9,6 +9,8 @@ from .common import ( SearchInfoExtractor ) from ..compat import ( + compat_HTTPError, + compat_kwargs, compat_str, compat_urlparse, ) @@ -255,7 +257,6 @@ class SoundcloudIE(InfoExtractor): _API_BASE = 'https://api.soundcloud.com/' _API_V2_BASE = 'https://api-v2.soundcloud.com/' _BASE_URL = 'https://soundcloud.com/' - _CLIENT_ID = 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' _ARTWORK_MAP = { @@ -271,9 +272,39 @@ class SoundcloudIE(InfoExtractor): 'original': 0, } + def _update_client_id(self): + webpage = self._download_webpage('https://soundcloud.com/', None) + for src in reversed(re.findall(r']+src="([^"]+)"', webpage)): + script = self._download_webpage(src, None, fatal=False) + if script: + client_id = self._search_regex( + r'client_id\s*:\s*"([0-9a-zA-Z]{32})"', + script, 'client id', default=None) + if client_id: + self._CLIENT_ID = client_id + self._downloader.cache.store('soundcloud', 'client_id', client_id) + return + raise ExtractorError('Unable to extract client id') + + def _download_json(self, *args, **kwargs): + query = kwargs.get('query', {}).copy() + for _ in range(2): + query['client_id'] = self._CLIENT_ID + kwargs['query'] = query + try: + return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self._update_client_id() + continue + raise + + def _real_initialize(self): + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + @classmethod def _resolv_url(cls, url): - return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID + return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2): track_id = compat_str(info['id']) @@ -451,9 +482,7 @@ class SoundcloudIE(InfoExtractor): track_id = mobj.group('track_id') - query = { - 'client_id': self._CLIENT_ID, - } + query = {} if track_id: info_json_url = self._API_V2_BASE + 'tracks/' + track_id full_title = track_id @@ -536,7 +565,6 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): COMMON_QUERY = { 'limit': 2000000000, - 'client_id': self._CLIENT_ID, 'linked_partitioning': '1', } @@ -722,9 +750,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') - query = { - 'client_id': self._CLIENT_ID, - } + query = {} token = mobj.group('token') if token: query['secret_token'] = token @@ -761,7 +787,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): self._MAX_RESULTS_PER_PAGE) query.update({ 'limit': limit, - 'client_id': self._CLIENT_ID, 'linked_partitioning': 1, 'offset': 0, }) From 2b845c408653683f8266665f03b145ecaad76f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Dec 2019 22:16:39 +0700 Subject: [PATCH 07/21] [spankbang] Fix extraction (closes #23307, closes #23423, closes #23444) --- youtube_dl/extractor/spankbang.py | 36 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e040ada29..d02ce6b57 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, merge_dicts, orderedSet, @@ -75,11 +76,20 @@ class SpankBangIE(InfoExtractor): if not f_url: return f = parse_resolution(format_id) - f.update({ - 'url': f_url, - 'format_id': format_id, - }) - formats.append(f) + ext = determine_ext(f_url) + if format_id.startswith('m3u8') or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id.startswith('mpd') or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'mp4' or f.get('width') or f.get('height'): + f.update({ + 'url': f_url, + 'format_id': format_id, + }) + formats.append(f) STREAM_URL_PREFIX = 'stream_url_' @@ -93,28 +103,22 @@ class SpankBangIE(InfoExtractor): r'data-streamkey\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'stream key', group='value') - sb_csrf_session = self._get_cookies( - 'https://spankbang.com')['sb_csrf_session'].value - stream = self._download_json( 'https://spankbang.com/api/videos/stream', video_id, 'Downloading stream JSON', data=urlencode_postdata({ 'id': stream_key, 'data': 0, - 'sb_csrf_session': sb_csrf_session, }), headers={ 'Referer': url, - 'X-CSRFToken': sb_csrf_session, + 'X-Requested-With': 'XMLHttpRequest', }) for format_id, format_url in stream.items(): - if format_id.startswith(STREAM_URL_PREFIX): - if format_url and isinstance(format_url, list): - format_url = format_url[0] - extract_format( - format_id[len(STREAM_URL_PREFIX):], format_url) + if format_url and isinstance(format_url, list): + format_url = format_url[0] + extract_format(format_id, format_url) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) info = self._search_json_ld(webpage, video_id, default={}) From 0a02732b566c080434dc88e68f75a5e3c0239c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Dec 2019 22:18:01 +0700 Subject: [PATCH 08/21] [spankbang] Improve removed video detection (#23423) --- youtube_dl/extractor/spankbang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index d02ce6b57..61ca902ce 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -65,7 +65,7 @@ class SpankBangIE(InfoExtractor): url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) - if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage): raise ExtractorError( 'Video %s is not available' % video_id, expected=True) From 060680874654e77cfd03d150a834b58213379c8c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 31 Dec 2019 16:42:56 +0100 Subject: [PATCH 09/21] [brightcove] update policy key on failing requests --- youtube_dl/extractor/brightcove.py | 42 ++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 9553f82d6..5e0c4bc3e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -588,11 +588,15 @@ class BrightcoveNewIE(AdobePassIE): policy_key_id = '%s_%s' % (account_id, player_id) policy_key = self._downloader.cache.load('brightcove', policy_key_id) - if not policy_key: + policy_key_extracted = False + + def extract_policy_key(): webpage = self._download_webpage( 'http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + policy_key = None + catalog = self._search_regex( r'catalog\(({.+?})\);', webpage, 'catalog', default=None) if catalog: @@ -605,28 +609,38 @@ class BrightcoveNewIE(AdobePassIE): policy_key = self._search_regex( r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') + self._downloader.cache.store('brightcove', policy_key_id, policy_key) + return policy_key api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = { - 'Accept': 'application/json;pk=%s' % policy_key, - } + headers = {} referrer = smuggled_data.get('referrer') if referrer: headers.update({ 'Referer': referrer, 'Origin': re.search(r'https?://[^/]+', referrer).group(0), }) - try: - json_data = self._download_json(api_url, video_id, headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - message = json_data.get('message') or json_data['error_code'] - if json_data.get('error_subcode') == 'CLIENT_GEO': - self.raise_geo_restricted(msg=message) - raise ExtractorError(message, expected=True) - raise + + for _ in range(2): + if not policy_key: + policy_key = extract_policy_key() + policy_key_extracted = True + headers['Accept'] = 'application/json;pk=%s' % policy_key + try: + json_data = self._download_json(api_url, video_id, headers=headers) + break + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: + policy_key = None + continue + raise ExtractorError(message, expected=True) + raise errors = json_data.get('errors') if errors and errors[0].get('error_subcode') == 'TVE_AUTH': From f41347260c2c2cf723bc2bb8a5c11f67a22175d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Dec 2019 23:29:06 +0700 Subject: [PATCH 10/21] [pornhub] Fix extraction and add support for m3u8 formats (closes #22749, closes #23082) --- youtube_dl/extractor/pornhub.py | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index ba0ad7da2..75ed69cde 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -227,12 +227,13 @@ class PornHubIE(PornHubBaseIE): else: thumbnail, duration = [None] * 2 - if not video_urls: - tv_webpage = dl_webpage('tv') - + def extract_js_vars(webpage, pattern, fatal=True): assignments = self._search_regex( - r'(var.+?mediastring.+?)', tv_webpage, - 'encoded url').split(';') + pattern, webpage, 'encoded url', fatal=fatal) + if not assignments: + return {} + + assignments = assignments.split(';') js_vars = {} @@ -254,11 +255,31 @@ class PornHubIE(PornHubBaseIE): assn = re.sub(r'var\s+', '', assn) vname, value = assn.split('=', 1) js_vars[vname] = parse_js_value(value) + return js_vars - video_url = js_vars['mediastring'] - if video_url not in video_urls_set: - video_urls.append((video_url, None)) - video_urls_set.add(video_url) + def add_video_url(video_url): + v_url = url_or_none(video_url) + if not v_url: + return + if v_url in video_urls_set: + return + video_urls.append((v_url, None)) + video_urls_set.add(v_url) + + if not video_urls: + FORMAT_PREFIXES = ('media', 'quality') + js_vars = extract_js_vars( + webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), + fatal=False) + if js_vars: + for key, format_url in js_vars.items(): + if any(key.startswith(p) for p in FORMAT_PREFIXES): + add_video_url(format_url) + + if not video_urls: + js_vars = extract_js_vars( + dl_webpage('tv'), r'(var.+?mediastring.+?)') + add_video_url(js_vars['mediastring']) for mobj in re.finditer( r']+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P(?:(?!\1).)+)\1', @@ -276,10 +297,16 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - if determine_ext(video_url) == 'mpd': + ext = determine_ext(video_url) + if ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue tbr = None mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) if mobj: From 0164cd5dacf76b0fd295e82412fda60e7c60df61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Dec 2019 23:43:43 +0700 Subject: [PATCH 11/21] [pornhub] Improve locked videos detection (closes #22449, closes #22780) --- youtube_dl/extractor/pornhub.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 75ed69cde..b3251ccd9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -17,6 +17,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + NO_DEFAULT, orderedSet, remove_quotes, str_to_int, @@ -227,9 +228,9 @@ class PornHubIE(PornHubBaseIE): else: thumbnail, duration = [None] * 2 - def extract_js_vars(webpage, pattern, fatal=True): + def extract_js_vars(webpage, pattern, default=NO_DEFAULT): assignments = self._search_regex( - pattern, webpage, 'encoded url', fatal=fatal) + pattern, webpage, 'encoded url', default=default) if not assignments: return {} @@ -270,11 +271,15 @@ class PornHubIE(PornHubBaseIE): FORMAT_PREFIXES = ('media', 'quality') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), - fatal=False) + default=None) if js_vars: for key, format_url in js_vars.items(): if any(key.startswith(p) for p in FORMAT_PREFIXES): add_video_url(format_url) + if not video_urls and re.search( + r'<[^>]+\bid=["\']lockedPlayer', webpage): + raise ExtractorError( + 'Video %s is locked' % video_id, expected=True) if not video_urls: js_vars = extract_js_vars( From 2d30b92e116d097f5d9d794ad97f71ef6aadf8a9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 31 Dec 2019 19:48:40 +0100 Subject: [PATCH 12/21] [brightcove] invalidate policy key cache on failing requests --- youtube_dl/extractor/brightcove.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 5e0c4bc3e..85001b3ad 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -589,6 +589,7 @@ class BrightcoveNewIE(AdobePassIE): policy_key_id = '%s_%s' % (account_id, player_id) policy_key = self._downloader.cache.load('brightcove', policy_key_id) policy_key_extracted = False + store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): webpage = self._download_webpage( @@ -610,7 +611,7 @@ class BrightcoveNewIE(AdobePassIE): r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') - self._downloader.cache.store('brightcove', policy_key_id, policy_key) + store_pk(policy_key) return policy_key api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) @@ -638,6 +639,7 @@ class BrightcoveNewIE(AdobePassIE): self.raise_geo_restricted(msg=message) elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted: policy_key = None + store_pk(None) continue raise ExtractorError(message, expected=True) raise From de7aade2f872d6de2dbd0d82624e51c24968e057 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 31 Dec 2019 21:31:22 +0100 Subject: [PATCH 13/21] [soundcloud] fix client id extraction for non fatal requests --- youtube_dl/extractor/soundcloud.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index b3ffef8df..62e9d8643 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -15,6 +15,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + error_to_compat_str, ExtractorError, float_or_none, HEADRequest, @@ -272,6 +273,9 @@ class SoundcloudIE(InfoExtractor): 'original': 0, } + def _store_client_id(self, client_id): + self._downloader.cache.store('soundcloud', 'client_id', client_id) + def _update_client_id(self): webpage = self._download_webpage('https://soundcloud.com/', None) for src in reversed(re.findall(r']+src="([^"]+)"', webpage)): @@ -282,11 +286,14 @@ class SoundcloudIE(InfoExtractor): script, 'client id', default=None) if client_id: self._CLIENT_ID = client_id - self._downloader.cache.store('soundcloud', 'client_id', client_id) + self._store_client_id(client_id) return raise ExtractorError('Unable to extract client id') def _download_json(self, *args, **kwargs): + non_fatal = kwargs.get('fatal') is False + if non_fatal: + del kwargs['fatal'] query = kwargs.get('query', {}).copy() for _ in range(2): query['client_id'] = self._CLIENT_ID @@ -295,8 +302,12 @@ class SoundcloudIE(InfoExtractor): return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self._store_client_id(None) self._update_client_id() continue + elif non_fatal: + self._downloader.report_warning(error_to_compat_str(e)) + return False raise def _real_initialize(self): From d6bf9cbd46c1eb65f7f79e5e1fde78ec665369e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Jan 2020 04:13:32 +0700 Subject: [PATCH 14/21] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 18080575b..c0b536be7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors +* [brightcove] Invalidate policy key cache on failing requests +* [pornhub] Improve locked videos detection (#22449, #22780) ++ [pornhub] Add support for m3u8 formats +* [pornhub] Fix extraction (#22749, #23082) +* [brightcove] Update policy key on failing requests +* [spankbang] Improve removed video detection (#23423) +* [spankbang] Fix extraction (#23307, #23423, #23444) +* [soundcloud] Automatically update client id on failing requests +* [prosiebensat1] Improve geo restriction handling (#23571) +* [brightcove] Cache brightcove player policy keys +* [teachable] Fail with error message if no video URL found +* [teachable] Improve locked lessons detection (#23528) ++ [scrippsnetworks] Add support for Scripps Networks sites (#19857, #22981) +* [mitele] Fix extraction (#21354, #23456) +* [soundcloud] Update client id (#23516) +* [mailru] Relax URL regular expressions (#23509) + + version 2019.12.25 Core From 0d5c415e1f4be8364bf842ac7548f09b472d72d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Jan 2020 05:20:48 +0700 Subject: [PATCH 15/21] [devscripts/create-github-release] Switch to using PAT for authentication Basic authentication will be deprecated soon --- devscripts/create-github-release.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 428111b3f..6464ef322 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -15,7 +15,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( compat_basestring, - compat_input, compat_getpass, compat_print, compat_urllib_request, @@ -40,28 +39,20 @@ class GitHubReleaser(object): try: info = netrc.netrc().authenticators(self._NETRC_MACHINE) if info is not None: - self._username = info[0] - self._password = info[2] + self._token = info[2] compat_print('Using GitHub credentials found in .netrc...') return else: compat_print('No GitHub credentials found in .netrc') except (IOError, netrc.NetrcParseError): compat_print('Unable to parse .netrc') - self._username = compat_input( - 'Type your GitHub username or email address and press [Return]: ') - self._password = compat_getpass( - 'Type your GitHub password and press [Return]: ') + self._token = compat_getpass( + 'Type your GitHub PAT (personal access token) and press [Return]: ') def _call(self, req): if isinstance(req, compat_basestring): req = sanitized_Request(req) - # Authorizing manually since GitHub does not response with 401 with - # WWW-Authenticate header set (see - # https://developer.github.com/v3/#basic-authentication) - b64 = base64.b64encode( - ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii') - req.add_header('Authorization', 'Basic %s' % b64) + req.add_header('Authorization', 'token %s' % self._token) response = self._opener.open(req).read().decode('utf-8') return json.loads(response) From ca069f68816c5da790c5745713b38c70df6abf65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Jan 2020 05:24:58 +0700 Subject: [PATCH 16/21] release 2020.01.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e6b82fda3..97b8afcf9 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.12.25** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.12.25 + [debug] youtube-dl version 2020.01.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9096af717..de6c44a65 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.12.25** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 5c235df0a..a9dd5ca52 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.12.25** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index fe6ab9aa0..8347903ea 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.12.25** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.12.25 + [debug] youtube-dl version 2020.01.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 76b028de4..92228513c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.12.25** +- [ ] I've verified that I'm running youtube-dl version **2020.01.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c0b536be7..c33169cd8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2020.01.01 Extractors * [brightcove] Invalidate policy key cache on failing requests diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 04956c546..e471aa79a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -761,6 +761,7 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** + - **ScrippsNetworks** - **scrippsnetworks:watch** - **SCTE** - **SCTECourse** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 606dbe1fb..8ad2df674 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.12.25' +__version__ = '2020.01.01' From 484637a9ccede2967a709d2026d29d7b61560e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 2 Jan 2020 22:45:42 +0700 Subject: [PATCH 17/21] [redtube] Detect private videos (#23518) --- youtube_dl/extractor/redtube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 5c84028ef..b1bde1e81 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -43,8 +43,15 @@ class RedTubeIE(InfoExtractor): webpage = self._download_webpage( 'http://www.redtube.com/%s' % video_id, video_id) - if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): - raise ExtractorError('Video %s has been removed' % video_id, expected=True) + ERRORS = ( + (('video-deleted-info', '>This video has been removed'), 'has been removed'), + (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'), + ) + + for patterns, message in ERRORS: + if any(p in webpage for p in patterns): + raise ExtractorError( + 'Video %s %s' % (video_id, message), expected=True) info = self._search_json_ld(webpage, video_id, default={}) From 44b434e4e3c4e64b25363bec1a3ededb7f667d72 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 5 Jan 2020 16:32:43 +0100 Subject: [PATCH 18/21] [vice] improve extraction(closes #23631) --- youtube_dl/extractor/vice.py | 212 +++++++++++++++++------------------ 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 8fdfd743d..e37499512 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,35 +1,50 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time +import functools import hashlib import json import random +import re +import time from .adobepass import AdobePassIE -from .youtube import YoutubeIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import ( compat_HTTPError, compat_str, ) from ..utils import ( + clean_html, ExtractorError, int_or_none, + OnDemandPagedList, parse_age_limit, str_or_none, try_get, ) -class ViceIE(AdobePassIE): +class ViceBaseIE(InfoExtractor): + def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''): + return self._download_json( + 'https://video.vice.com/api/v1/graphql', resource_id, query={ + 'query': '''{ + %s(locale: "%s", %s: "%s"%s) { + %s + } +}''' % (resource, locale, resource_key, resource_id, args, fields), + })['data'][resource] + + +class ViceIE(ViceBaseIE, AdobePassIE): IE_NAME = 'vice' - _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P[^/]+)/(?:video/[^/]+|embed)/(?P[\da-f]+)' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P[^/]+)/(?:video/[^/]+|embed)/(?P[\da-f]{24})' _TESTS = [{ 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'info_dict': { - 'id': '5e647f0125e145c9aef2069412c0cbde', + 'id': '58c69e38a55424f1227dc3f7', 'ext': 'mp4', 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', @@ -43,17 +58,16 @@ class ViceIE(AdobePassIE): # m3u8 download 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], }, { # geo restricted to US 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', 'info_dict': { - 'id': '930c0ad1f47141cc955087eecaddb0e2', + 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', - 'uploader': 'waypoint', + 'uploader': 'vice', 'title': 'The Signal From Tölva', 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', - 'uploader_id': '57f7d621e05ca860fa9ccaf9', + 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1477941983, 'upload_date': '20161031', }, @@ -61,15 +75,14 @@ class ViceIE(AdobePassIE): # m3u8 download 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', 'info_dict': { 'id': '581b12b60a0e1f4c0fb6ea2f', 'ext': 'mp4', 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', - 'description': '

Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.

', - 'uploader': 'VICE', + 'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.', + 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1485368119, 'upload_date': '20170125', @@ -78,9 +91,7 @@ class ViceIE(AdobePassIE): 'params': { # AES-encrypted m3u8 'skip_download': True, - 'proxy': '127.0.0.1:8118', }, - 'add_ie': ['UplynkPreplay'], }, { 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, @@ -98,7 +109,7 @@ class ViceIE(AdobePassIE): @staticmethod def _extract_urls(webpage): return re.findall( - r']+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', + r']+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', webpage) @staticmethod @@ -109,31 +120,16 @@ class ViceIE(AdobePassIE): def _real_extract(self, url): locale, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - 'https://video.vice.com/%s/embed/%s' % (locale, video_id), - video_id) - - video = self._parse_json( - self._search_regex( - r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage, - 'app state'), video_id)['video'] - video_id = video.get('vms_id') or video.get('id') or video_id - title = video['title'] - is_locked = video.get('locked') + video = self._call_api('videos', 'id', video_id, locale, '''body + locked + rating + thumbnail_url + title''')[0] + title = video['title'].strip() rating = video.get('rating') - thumbnail = video.get('thumbnail_url') - duration = int_or_none(video.get('duration')) - series = try_get( - video, lambda x: x['episode']['season']['show']['title'], - compat_str) - episode_number = try_get( - video, lambda x: x['episode']['episode_number']) - season_number = try_get( - video, lambda x: x['episode']['season']['season_number']) - uploader = None query = {} - if is_locked: + if video.get('locked'): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, rating) query['tvetoken'] = self._extract_mvpd_auth( @@ -148,12 +144,9 @@ class ViceIE(AdobePassIE): query.update({ 'exp': exp, 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), - '_ad_blocked': None, - '_ad_unit': '', - '_debug': '', + 'skipadstitching': 1, 'platform': 'desktop', 'rn': random.randint(10000, 100000), - 'fbprebidtoken': '', }) try: @@ -169,85 +162,94 @@ class ViceIE(AdobePassIE): raise video_data = preplay['video'] - base = video_data['base'] - uplynk_preplay_url = preplay['preplayURL'] - episode = video_data.get('episode', {}) - channel = video_data.get('channel', {}) + formats = self._extract_m3u8_formats( + preplay['playURL'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + episode = video_data.get('episode') or {} + channel = video_data.get('channel') or {} + season = video_data.get('season') or {} subtitles = {} - cc_url = preplay.get('ccURL') - if cc_url: - subtitles['en'] = [{ + for subtitle in preplay.get('subtitleURLs', []): + cc_url = subtitle.get('url') + if not cc_url: + continue + language_code = try_get(subtitle, lambda x: x['languages'][0]['language_code'], compat_str) or 'en' + subtitles.setdefault(language_code, []).append({ 'url': cc_url, - }] + }) return { - '_type': 'url_transparent', - 'url': uplynk_preplay_url, + 'formats': formats, 'id': video_id, 'title': title, - 'description': base.get('body') or base.get('display_body'), - 'thumbnail': thumbnail, - 'duration': int_or_none(video_data.get('video_duration')) or duration, + 'description': clean_html(video.get('body')), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video_data.get('video_duration')), 'timestamp': int_or_none(video_data.get('created_at'), 1000), - 'age_limit': parse_age_limit(video_data.get('video_rating')), - 'series': video_data.get('show_title') or series, - 'episode_number': int_or_none(episode.get('episode_number') or episode_number), + 'age_limit': parse_age_limit(video_data.get('video_rating') or rating), + 'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str), + 'episode_number': int_or_none(episode.get('episode_number')), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), - 'season_number': int_or_none(season_number), - 'season_id': str_or_none(episode.get('season_id')), - 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, + 'season_number': int_or_none(season.get('season_number')), + 'season_id': str_or_none(season.get('id') or video_data.get('season_id')), + 'uploader': channel.get('name'), 'uploader_id': str_or_none(channel.get('id')), 'subtitles': subtitles, - 'ie_key': 'UplynkPreplay', } -class ViceShowIE(InfoExtractor): +class ViceShowIE(ViceBaseIE): IE_NAME = 'vice:show' - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P[^/?#&]+)' - - _TEST = { - 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', + _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P[^/]+)/show/(?P[^/?#&]+)' + _PAGE_SIZE = 25 + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/show/fck-thats-delicious', 'info_dict': { - 'id': 'fuck-thats-delicious-2', - 'title': "Fuck, That's Delicious", - 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', + 'id': '57a2040c8cb727dec794c901', + 'title': 'F*ck, That’s Delicious', + 'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.', }, - 'playlist_count': 17, - } + 'playlist_mincount': 64, + }, { + 'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious', + 'only_matching': True, + }] + + def _fetch_page(self, locale, show_id, page): + videos = self._call_api('videos', 'show_id', show_id, locale, '''body + id + url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE)) + for video in videos: + yield self.url_result( + video['url'], ViceIE.ie_key(), video.get('id')) def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + locale, display_id = re.match(self._VALID_URL, url).groups() + show = self._call_api('shows', 'slug', display_id, locale, '''dek + id + title''')[0] + show_id = show['id'] - entries = [ - self.url_result(video_url, ViceIE.ie_key()) - for video_url, _ in re.findall( - r']+class="article-title"[^>]+data-id="\d+"[^>]*>\s*]+href="(%s.*?)"' - % ViceIE._VALID_URL, webpage)] + entries = OnDemandPagedList( + functools.partial(self._fetch_page, locale, show_id), + self._PAGE_SIZE) - title = self._search_regex( - r'(.+?)', webpage, 'title', default=None) - if title: - title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta( - 'description', webpage, 'description') - - return self.playlist_result(entries, show_id, title, description) + return self.playlist_result( + entries, show_id, show.get('title'), show.get('dek')) -class ViceArticleIE(InfoExtractor): +class ViceArticleIE(ViceBaseIE): IE_NAME = 'vice:article' - _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P[^?#]+)' + _VALID_URL = r'https://(?:www\.)?vice\.com/(?P[^/]+)/article/(?:[0-9a-z]{6}/)?(?P[^?#]+)' _TESTS = [{ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'info_dict': { - 'id': '41eae2a47b174a1398357cec55f1f6fc', + 'id': '58dc0a3dee202d2a0ccfcbd8', 'ext': 'mp4', - 'title': 'Mormon War on Porn ', - 'description': 'md5:6394a8398506581d0346b9ab89093fef', + 'title': 'Mormon War on Porn', + 'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1491883129, @@ -258,10 +260,10 @@ class ViceArticleIE(InfoExtractor): # AES-encrypted m3u8 'skip_download': True, }, - 'add_ie': ['UplynkPreplay'], + 'add_ie': [ViceIE.ie_key()], }, { 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', - 'md5': '7fe8ebc4fa3323efafc127b82bd821d9', + 'md5': '13010ee0bc694ea87ec40724397c2349', 'info_dict': { 'id': '3jstaBeXgAs', 'ext': 'mp4', @@ -271,15 +273,15 @@ class ViceArticleIE(InfoExtractor): 'uploader_id': 'MotherboardTV', 'upload_date': '20140529', }, - 'add_ie': ['Youtube'], + 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'info_dict': { - 'id': 'e2ed435eb67e43efb66e6ef9a6930a88', + 'id': '57f41d3556a0a80f54726060', 'ext': 'mp4', 'title': "Making The World's First Male Sex Doll", - 'description': 'md5:916078ef0e032d76343116208b6cc2c4', + 'description': 'md5:19b00b215b99961cf869c40fbe9df755', 'uploader': 'vice', 'uploader_id': '57a204088cb727dec794c67b', 'timestamp': 1476919911, @@ -288,6 +290,7 @@ class ViceArticleIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { @@ -299,14 +302,11 @@ class ViceArticleIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + locale, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - - prefetch_data = self._parse_json(self._search_regex( - r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n', - webpage, 'app state'), display_id)['pageData'] - body = prefetch_data['body'] + article = self._call_api('articles', 'slug', display_id, locale, '''body + embed_code''')[0] + body = article['body'] def _url_res(video_url, ie_key): return { @@ -316,7 +316,7 @@ class ViceArticleIE(InfoExtractor): 'ie_key': ie_key, } - vice_url = ViceIE._extract_url(webpage) + vice_url = ViceIE._extract_url(body) if vice_url: return _url_res(vice_url, ViceIE.ie_key()) @@ -332,6 +332,6 @@ class ViceArticleIE(InfoExtractor): video_url = self._html_search_regex( r'data-video-url="([^"]+)"', - prefetch_data['embed_code'], 'video URL') + article['embed_code'], 'video URL') return _url_res(video_url, ViceIE.ie_key()) From 259ad381730c1b3479c604820bf8333f06f65c64 Mon Sep 17 00:00:00 2001 From: nmeum Date: Sun, 5 Jan 2020 19:26:22 +0100 Subject: [PATCH 19/21] [devscripts/create-github-release] Remove unused import --- devscripts/create-github-release.py | 1 - 1 file changed, 1 deletion(-) diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 6464ef322..2ddfa1096 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -1,7 +1,6 @@ #!/usr/bin/env python from __future__ import unicode_literals -import base64 import io import json import mimetypes From 233826f68f75ec8ee93c5762bf0cd6fceffab0bb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 5 Jan 2020 21:08:50 +0100 Subject: [PATCH 20/21] [wistia] improve format extraction and extract subtitles(closes #22590) --- youtube_dl/extractor/wistia.py | 68 +++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 0fbc888ec..085514d47 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -13,8 +13,7 @@ from ..utils import ( class WistiaIE(InfoExtractor): _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P[a-z0-9]{10})' - _API_URL = 'http://fast.wistia.com/embed/medias/%s.json' - _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s' + _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' _TESTS = [{ 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', @@ -67,10 +66,10 @@ class WistiaIE(InfoExtractor): video_id = self._match_id(url) data_json = self._download_json( - self._API_URL % video_id, video_id, + self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id, # Some videos require this. headers={ - 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id, + 'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id, }) if data_json.get('error'): @@ -95,27 +94,61 @@ class WistiaIE(InfoExtractor): 'url': aurl, 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), }) else: aext = a.get('ext') - is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8' - formats.append({ - 'format_id': atype, + display_name = a.get('display_name') + format_id = atype + if atype and atype.endswith('_video') and display_name: + format_id = '%s-%s' % (atype[:-6], display_name) + f = { + 'format_id': format_id, 'url': aurl, - 'tbr': int_or_none(a.get('bitrate')), - 'vbr': int_or_none(a.get('opt_vbitrate')), - 'width': int_or_none(a.get('width')), - 'height': int_or_none(a.get('height')), - 'filesize': int_or_none(a.get('size')), - 'vcodec': a.get('codec'), - 'container': a.get('container'), - 'ext': 'mp4' if is_m3u8 else aext, - 'protocol': 'm3u8' if is_m3u8 else None, + 'tbr': int_or_none(a.get('bitrate')) or None, 'preference': 1 if atype == 'original' else None, - }) + } + if display_name == 'Audio': + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'vcodec': a.get('codec'), + }) + if a.get('container') == 'm3u8' or aext == 'm3u8': + ts_f = f.copy() + ts_f.update({ + 'ext': 'ts', + 'format_id': f['format_id'].replace('hls-', 'ts-'), + 'url': f['url'].replace('.bin', '.ts'), + }) + formats.append(ts_f) + f.update({ + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }) + else: + f.update({ + 'container': a.get('container'), + 'ext': aext, + 'filesize': int_or_none(a.get('size')), + }) + formats.append(f) self._sort_formats(formats) + subtitles = {} + for caption in data.get('captions', []): + language = caption.get('language') + if not language: + continue + subtitles[language] = [{ + 'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language, + }] + return { 'id': video_id, 'title': title, @@ -124,4 +157,5 @@ class WistiaIE(InfoExtractor): 'thumbnails': thumbnails, 'duration': float_or_none(data.get('duration')), 'timestamp': int_or_none(data.get('createdAt')), + 'subtitles': subtitles, } From 0d2306d02beb4c1e50c3f279c109ab25f94ae421 Mon Sep 17 00:00:00 2001 From: Roxedus Date: Mon, 6 Jan 2020 00:34:36 +0100 Subject: [PATCH 21/21] [nrktv:seriebase] Fix extraction (closes #23625) (#23537) --- youtube_dl/extractor/nrk.py | 48 +++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 60933f069..94115534b 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, JSON_LD_RE, + js_to_json, NO_DEFAULT, parse_age_limit, parse_duration, @@ -105,6 +106,7 @@ class NRKBaseIE(InfoExtractor): MESSAGES = { 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', } message_type = data.get('messageType', '') @@ -255,6 +257,17 @@ class NRKTVIE(NRKBaseIE): ''' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223, + 'age_limit': 6, + }, + }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { @@ -266,6 +279,7 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, + 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -370,7 +384,24 @@ class NRKTVIE(NRKBaseIE): class NRKTVEpisodeIE(InfoExtractor): _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' - _TEST = { + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', + 'info_dict': { + 'id': 'MUHH36005220BA', + 'ext': 'mp4', + 'title': 'Kro, krig og kjærlighet 2:6', + 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', + 'duration': 1563, + 'series': 'Hellums kro', + 'season_number': 1, + 'episode_number': 2, + 'episode': '2:6', + 'age_limit': 6, + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { 'id': 'MSUI14000816AA', @@ -386,7 +417,8 @@ class NRKTVEpisodeIE(InfoExtractor): 'params': { 'skip_download': True, }, - } + 'skip': 'ProgramRightsHasExpired', + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -409,7 +441,7 @@ class NRKTVSerieBaseIE(InfoExtractor): (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False) + display_id, fatal=False, transform_source=js_to_json) if not config: return return try_get( @@ -479,6 +511,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/blank', + 'info_dict': { + 'id': 'blank', + 'title': 'Blank', + 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', + }, + 'playlist_mincount': 30, + }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { @@ -648,7 +688,7 @@ class NRKSkoleIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099', - 'md5': '6bc936b01f9dd8ed45bc58b252b2d9b6', + 'md5': '18c12c3d071953c3bf8d54ef6b2587b7', 'info_dict': { 'id': '6021', 'ext': 'mp4',