From 7bee705d8f110f09d8e72b1c863ff197ccc1d4f1 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Thu, 14 Feb 2019 11:28:16 -0400 Subject: [PATCH 01/34] [openload] Add support for oload.live --- youtube_dl/extractor/openload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index a2ae25272..c1dcbb7eb 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -249,7 +249,7 @@ class OpenloadIE(InfoExtractor): (?:www\.)? (?: openload\.(?:co|io|link|pw)| - oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw) + oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live) ) )/ (?:f|embed)/ @@ -346,6 +346,9 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://oload.pw/f/WyKgK8s94N0', 'only_matching': True, + }, { + 'url': 'https://oload.live/f/-Z58UZ-GR4M', + 'only_matching': True, }] _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' From 794c1b6e02591b04da931fa59745bc47bfae7492 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Feb 2019 23:40:46 +0700 Subject: [PATCH 02/34] [vshare] Pass Referer to download request (closes #19205, closes #19221) --- youtube_dl/extractor/vshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py index e4ec77889..c631ac1fa 100644 --- a/youtube_dl/extractor/vshare.py +++ b/youtube_dl/extractor/vshare.py @@ -48,7 +48,7 @@ class VShareIE(InfoExtractor): webpage = self._download_webpage( 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, - video_id) + video_id, headers={'Referer': url}) title = self._html_search_regex( r'([^<]+)', webpage, 'title') From 2b2da3ba10cc325d00b665aae87f0fa8508bccdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Feb 2019 23:56:29 +0700 Subject: [PATCH 03/34] [rai] Relax _VALID_URL (closes #19232) --- youtube_dl/extractor/rai.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 548a6553b..149153b8f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -288,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor): class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ # var uniquename = "ContentItem-..." # data-id="ContentItem-..." @@ -375,6 +375,9 @@ class RaiIE(RaiBaseIE): # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, + }, { + 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', + 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): From ba2e3730d125eab952eded3bb7749d479a2262d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Feb 2019 22:45:53 +0700 Subject: [PATCH 04/34] [noovo] Fix extraction (closes #19230) --- youtube_dl/extractor/noovo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index 974de3c3e..b40770d07 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bc_url = BrightcoveNewIE._extract_url(self, webpage) + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') data = self._parse_json( self._search_regex( @@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, 'title': title, 'description': description, 'series': series, From ae65c93a26f2b3cf806477a3ee891aa461b5c6b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 00:58:13 +0700 Subject: [PATCH 05/34] [udemy] Update User-Agent and detect captcha (closes #14713, closes #15839, closes #18126) --- youtube_dl/extractor/udemy.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 105826e9b..89a7f6ade 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -123,10 +123,22 @@ class UdemyIE(InfoExtractor): def _download_webpage_handle(self, *args, **kwargs): headers = kwargs.get('headers', {}).copy() - headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' kwargs['headers'] = headers - return super(UdemyIE, self)._download_webpage_handle( + ret = super(UdemyIE, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies.', expected=True) + return ret def _download_json(self, url_or_request, *args, **kwargs): headers = { From d7d513891b7e63337218c5cb0bf743c8f7044381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 01:05:01 +0700 Subject: [PATCH 06/34] [udemy] Extend _VALID_URLs (closes #14330, closes #15883) --- youtube_dl/extractor/udemy.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 89a7f6ade..ae8de9897 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -29,7 +29,7 @@ class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'''(?x) https?:// - www\.udemy\.com/ + (?:[^/]+\.)?udemy\.com/ (?: [^#]+\#/lecture/| lecture/view/?\?lectureId=| @@ -64,6 +64,9 @@ class UdemyIE(InfoExtractor): # only outputs rendition 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -415,8 +418,14 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P[^/?#&]+)' - _TESTS = [] + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] @classmethod def suitable(cls, url): From c9a0ea6e51eff28b9a383a47215870fd5875fc3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 05:00:16 +0700 Subject: [PATCH 07/34] [bilibili] Update keys (closes #19233) --- youtube_dl/extractor/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4d6b051fe..3746671d3 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor): }] }] - _APP_KEY = '84956560bc028eb7' - _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' def _report_error(self, result): if 'message' in result: From 659e93fcf5c0480ac461cda412335cecf6a5595f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 07:12:10 +0700 Subject: [PATCH 08/34] [linuxacademy] Add extractor (closes #12207) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/linuxacademy.py | 174 +++++++++++++++++++++++++++ 2 files changed, 175 insertions(+) create mode 100644 youtube_dl/extractor/linuxacademy.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3e1b63b4b..c70452dcd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -593,6 +593,7 @@ from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, ) +from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE from .liveleak import ( LiveLeakIE, diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py new file mode 100644 index 000000000..a78c6556e --- /dev/null +++ b/youtube_dl/extractor/linuxacademy.py @@ -0,0 +1,174 @@ +from __future__ import unicode_literals + +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, + urlencode_postdata, + urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?linuxacademy\.com/cp/ + (?: + courses/lesson/course/(?P\d+)/lesson/(?P\d+)| + modules/view/id/(?P\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'info_dict': { + 'id': '1498-2', + 'ext': 'mp4', + 'title': "Introduction to the Practitioner's Brief", + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', + 'only_matching': True, + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/154', + 'info_dict': { + 'id': '154', + 'title': 'AWS Certified Cloud Practitioner', + 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + }, + 'playlist_count': 41, + 'skip': 'Requires Linux Academy account credentials', + }] + + _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' + _ORIGIN_URL = 'https://linuxacademy.com' + _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' + _NETRC_MACHINE = 'linuxacademy' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def random_string(): + return ''.join([ + random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') + for _ in range(32)]) + + webpage, urlh = self._download_webpage_handle( + self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ + 'client_id': self._CLIENT_ID, + 'response_type': 'token id_token', + 'redirect_uri': self._ORIGIN_URL, + 'scope': 'openid email user_impersonation profile', + 'audience': self._ORIGIN_URL, + 'state': random_string(), + 'nonce': random_string(), + }) + + login_data = self._parse_json( + self._search_regex( + r'atob\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'login info', group='value'), None, + transform_source=lambda x: compat_b64decode(x).decode('utf-8') + )['extraParams'] + + login_data.update({ + 'client_id': self._CLIENT_ID, + 'redirect_uri': self._ORIGIN_URL, + 'tenant': 'lacausers', + 'connection': 'Username-Password-Authentication', + 'username': username, + 'password': password, + 'sso': 'true', + }) + + login_state_url = compat_str(urlh.geturl()) + + try: + login_page = self._download_webpage( + 'https://login.linuxacademy.com/usernamepassword/login', None, + 'Downloading login page', data=json.dumps(login_data).encode(), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read(), None) + message = error.get('description') or error['code'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + raise + + callback_page, urlh = self._download_webpage_handle( + 'https://login.linuxacademy.com/login/callback', None, + 'Downloading callback page', + data=urlencode_postdata(self._hidden_inputs(login_page)), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + + access_token = self._search_regex( + r'access_token=([^=&]+)', compat_str(urlh.geturl()), + 'access token') + + self._download_webpage( + 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' + % access_token, None, 'Downloading token validation page') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') + item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) + + webpage = self._download_webpage(url, item_id) + + # course path + if course_id: + entries = [ + self.url_result( + urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) + for lesson_url in orderedSet(re.findall( + r']+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', + webpage))] + title = unescapeHTML(self._html_search_regex( + (r'class=["\']course-title["\'][^>]*>(?P[^<]+)', + r'var\s+title\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', default=None, group='value')) + description = unescapeHTML(self._html_search_regex( + r'var\s+description\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'description', default=None, group='value')) + return self.playlist_result(entries, course_id, title, description) + + # single video path + info = self._extract_jwplayer_data( + webpage, item_id, require_title=False, m3u8_id='hls',) + title = self._search_regex( + (r'>Lecture\s*:\s*(?P[^<]+)', + r'lessonName\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + info.update({ + 'id': item_id, + 'title': title, + }) + return info From 3c9647372e78134777d201e157a5ef42345c9da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 13:38:21 +0700 Subject: [PATCH 09/34] [tvp] Fix description extraction, make thumbnail optional and fix tests --- youtube_dl/extractor/tvp.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 3954f0b93..f9bf600b0 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -19,12 +19,12 @@ class TVPIE(InfoExtractor): _TESTS = [{ 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': '8aa518c15e5cc32dfe8db400dc921fbb', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', - 'title': 'Czas honoru, I seria – odc. 13', - 'description': 'md5:381afa5bca72655fe94b05cfe82bf53d', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:437f48b93558370b031740546b696e24', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -45,6 +45,7 @@ class TVPIE(InfoExtractor): 'title': 'Wiadomości, 28.09.2017, 19:30', 'description': 'Wydanie główne codziennego serwisu informacyjnego.' }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -75,8 +76,10 @@ class TVPIE(InfoExtractor): return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } @@ -87,6 +90,14 @@ class TVPEmbedIE(InfoExtractor): _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P\d+)' _TESTS = [{ + 'url': 'tvp:194536', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + }, + }, { 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { From 34568dc2967d227630ed9d7150deaa62a689b937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Feb 2019 13:39:00 +0700 Subject: [PATCH 10/34] [tvp] Detect unavailable videos --- youtube_dl/extractor/tvp.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index f9bf600b0..536b580fc 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -98,6 +98,7 @@ class TVPEmbedIE(InfoExtractor): 'title': 'Czas honoru, odc. 13 – Władek', }, }, { + # not available 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { @@ -105,6 +106,7 @@ class TVPEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Panorama, 07.12.2015, 15:40', }, + 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { 'url': 'tvp:22670268', 'only_matching': True, @@ -116,10 +118,13 @@ class TVPEmbedIE(InfoExtractor): webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - error_massage = get_element_by_attribute('class', 'msg error', webpage) - if error_massage: + error = self._html_search_regex( + r'(?s)]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)

', + webpage, 'error', default=None) or clean_html( + get_element_by_attribute('class', 'msg error', webpage)) + if error: raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error_massage)), expected=True) + self.IE_NAME, clean_html(error)), expected=True) title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P.+?)\1', From d93083789bf9c318b18d52ac132e9495345b9ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 17 Feb 2019 14:09:30 +0700 Subject: [PATCH 11/34] [tvp:series] Fix extraction --- youtube_dl/extractor/tvp.py | 67 ++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 536b580fc..05669a366 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,14 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..utils import ( - determine_ext, clean_html, - get_element_by_attribute, + determine_ext, ExtractorError, + get_element_by_attribute, + orderedSet, ) @@ -198,46 +200,35 @@ class TVPEmbedIE(InfoExtractor): class TVPSeriesIE(InfoExtractor): IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)/video' _TESTS = [{ - 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem', + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 'info_dict': { - 'title': 'Ogniem i mieczem', - 'id': '4278026', + 'id': '38678312', }, - 'playlist_count': 4, - }, { - 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat', - 'info_dict': { - 'title': 'Boso przez świat', - 'id': '9329207', - }, - 'playlist_count': 86, + 'playlist_count': 115, }] + def _entries(self, url, display_id): + for page_num in itertools.count(1): + page = self._download_webpage( + url, display_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + + video_ids = orderedSet(re.findall( + r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, + page)) + + if not video_ids: + break + + for video_id in video_ids: + yield self.url_result( + 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), + video_id=video_id) + def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id, tries=5) - - title = self._html_search_regex( - r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series') - playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id') - playlist = self._download_webpage( - 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend' - 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5, - note='Downloading playlist') - - videos_paths = re.findall( - '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist) - entries = [ - self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key()) - for v_path in videos_paths] - - return { - '_type': 'playlist', - 'id': playlist_id, - 'display_id': display_id, - 'title': title, - 'entries': entries, - } + mobj = re.match(self._VALID_URL, url) + display_id, playlist_id = mobj.group('display_id', 'id') + return self.playlist_result(self._entries(url, display_id), playlist_id) From 388cfbd3d8915ebb99714ac8e7ce4151edf96d8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 17 Feb 2019 14:27:00 +0700 Subject: [PATCH 12/34] [tvp:website] Improve support --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tvp.py | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c70452dcd..923dfe7f4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1218,7 +1218,7 @@ from .tvnow import ( from .tvp import ( TVPEmbedIE, TVPIE, - TVPSeriesIE, + TVPWebsiteIE, ) from .tvplay import ( TVPlayIE, diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 05669a366..accff75b5 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -198,19 +198,36 @@ class TVPEmbedIE(InfoExtractor): } -class TVPSeriesIE(InfoExtractor): +class TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)/video' + _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' _TESTS = [{ + # series 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 'info_dict': { 'id': '38678312', }, 'playlist_count': 115, + }, { + # film + 'url': 'https://vod.tvp.pl/website/gloria,35139666', + 'info_dict': { + 'id': '36637049', + 'ext': 'mp4', + 'title': 'Gloria, Gloria', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'only_matching': True, }] - def _entries(self, url, display_id): + def _entries(self, display_id, playlist_id): + url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) for page_num in itertools.count(1): page = self._download_webpage( url, display_id, 'Downloading page %d' % page_num, @@ -231,4 +248,5 @@ class TVPSeriesIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id, playlist_id = mobj.group('display_id', 'id') - return self.playlist_result(self._entries(url, display_id), playlist_id) + return self.playlist_result( + self._entries(display_id, playlist_id), playlist_id) From c76fc5b22a70f9ac24fe7e34c37aa8ef82e85c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Feb 2019 02:10:06 +0700 Subject: [PATCH 13/34] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 398528f76..adbdf166d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version <unreleased> + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ [twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) ++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + version 2019.02.08 Core From 77a842c8926625fe791ed36613f183bb195394cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 18 Feb 2019 02:11:11 +0700 Subject: [PATCH 14/34] release 2019.02.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7128d998f..ff626883d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.08*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.08** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.02.08 +[debug] youtube-dl version 2019.02.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index adbdf166d..f9dd7928f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.02.18 Extractors * [tvp:website] Fix and improve extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 32fe6b647..d8a8d7ede 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -458,6 +458,7 @@ - **LineTV** - **linkedin:learning** - **linkedin:learning:course** + - **LinuxAcademy** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -915,6 +916,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **TruNews** - **TruTV** - **Tube8** - **TubiTv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4dc5a611e..ea1d5a4a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.02.08' +__version__ = '2019.02.18' From caf48f557a8f4f904c88346bcfc462069b8745bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 21 Feb 2019 05:59:07 +0700 Subject: [PATCH 15/34] [metacafe] Fix family filter bypass (closes #19287) --- youtube_dl/extractor/metacafe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 28f59f63c..9e92416d1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_urllib_parse, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, @@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor): headers = { # Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False}) + 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) } # AnyClip videos require the flashversion cookie so that we get the link From 37b239b3b66ea9e2a71bae41e9da6dba8ee5554c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Feb 2019 00:43:29 +0700 Subject: [PATCH 16/34] [downloader/external] Fix infinite retries for curl (closes #19303) --- youtube_dl/downloader/external.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 958d00aac..0b88bfd94 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -121,7 +121,11 @@ class CurlFD(ExternalFD): cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') cmd += self._option('--limit-rate', 'ratelimit') - cmd += self._option('--retry', 'retries') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') From 8c80603f1adea843d96c0598b902106c7a3efb7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Feb 2019 00:58:56 +0700 Subject: [PATCH 17/34] [downloader/external] Add support for rate limit and retries for wget --- youtube_dl/downloader/external.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 0b88bfd94..22e6093b3 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -164,6 +164,12 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry cmd += self._option('--bind-address', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') From f0228f56fb2441510aa966ba9298e388b209cde1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Feb 2019 21:01:25 +0700 Subject: [PATCH 18/34] [bbccouk] Make subtitles non fatal (#19651) --- youtube_dl/extractor/bbc.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index eac9a5a46..13340ec64 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re +import xml from .common import InfoExtractor from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( parse_iso8601, try_get, unescapeHTML, + url_or_none, urlencode_postdata, urljoin, ) @@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor): def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, xml.etree.ElementTree.Element): + continue lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') subtitles[lang] = [ { From 55b8588f0e4dd9597b6da5c46d05b9dd1e9f5960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Feb 2019 23:19:15 +0700 Subject: [PATCH 19/34] [servus] Fix extraction (closes #19297) --- youtube_dl/extractor/servus.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 264e1dd8b..e579d42cf 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -1,31 +1,44 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class ServusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)' + _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)' _TESTS = [{ 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', - 'md5': '046dee641cda1c4cabe13baef3be2c1c', + 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', - 'title': 'Die Grünen aus Volkssicht', - 'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Die Grünen aus Sicht des Volkes', + 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'thumbnail': r're:^https?://.*\.jpg', } }, { 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/', 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).upper() webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + title = self._search_regex( + (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), + webpage, 'title', default=None, + group='title') or self._og_search_title(webpage) + title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) From db1c3a9d3f202cc6f3fd83a2a918869e7c0d147f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 27 Feb 2019 03:41:15 +0700 Subject: [PATCH 20/34] [periscope] Extract width and height (closes #20015) --- youtube_dl/extractor/periscope.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 8afe541ec..b337a56c0 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + int_or_none, parse_iso8601, unescapeHTML, ) @@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + width = int_or_none(broadcast.get('width')) + height = int_or_none(broadcast.get('height')) + + def add_width_and_height(f): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + video_urls = set() formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): @@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) if format_id != 'rtmp': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, token, 'mp4', entry_protocol='m3u8_native' if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False)) + m3u8_id=format_id, fatal=False) + if len(m3u8_formats) == 1: + add_width_and_height(m3u8_formats[0]) + formats.extend(m3u8_formats) continue - formats.append({ + rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', - }) + } + add_width_and_height(rtmp_format) + formats.append(rtmp_format) self._sort_formats(formats) return { From 9d9a8676dc02101069cf5fa9862500d39352538c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Feb 2019 23:26:52 +0700 Subject: [PATCH 21/34] [francetv:site] Extend video id regex (closes #20029, closes #20071) --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 2ffe83a78..3c4ef08a8 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): catalogue = None video_id = self._search_regex( - r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + r'(?:data-main-video\s*=|videoId\s*:)\s*(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', default=None, group='id') if not video_id: From ff60ec8f029d12c119855ec82d7ce9ecda388651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 1 Mar 2019 00:47:18 +0700 Subject: [PATCH 22/34] [npo] Fix extraction (#20084) --- youtube_dl/extractor/npo.py | 120 +++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 5a427c396..857845d35 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -12,11 +12,16 @@ from ..utils import ( ExtractorError, fix_xml_ampersands, int_or_none, + merge_dicts, orderedSet, parse_duration, qualities, + str_or_none, strip_jsonp, unified_strdate, + unified_timestamp, + url_or_none, + urlencode_postdata, ) @@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._get_info(video_id) + try: + return self._get_info(url, video_id) + except ExtractorError: + return self._get_old_info(video_id) - def _get_info(self, video_id): + def _get_info(self, url, video_id): + token = self._download_json( + 'https://www.npostart.nl/api/token', video_id, + 'Downloading token', headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + })['token'] + + player = self._download_json( + 'https://www.npostart.nl/player/%s' % video_id, video_id, + 'Downloading player JSON', data=urlencode_postdata({ + 'autoplay': 0, + 'share': 1, + 'pageUrl': url, + 'hasAdConsent': 0, + '_token': token, + })) + + player_token = player['token'] + + format_urls = set() + formats = [] + for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): + streams = self._download_json( + 'https://start-player.npo.nl/video/%s/streams' % video_id, + video_id, 'Downloading %s profile JSON' % profile, fatal=False, + query={ + 'profile': profile, + 'quality': 'npo', + 'tokenId': player_token, + 'streamType': 'broadcast', + }) + if not streams: + continue + stream = streams.get('stream') + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('src')) + if not stream_url or stream_url in format_urls: + continue + format_urls.add(stream_url) + if stream.get('protection') is not None: + continue + stream_type = stream.get('type') + stream_ext = determine_ext(stream_url) + if stream_type == 'application/dash+xml' or stream_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif '.ism/Manifest' in stream_url: + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': stream_url, + }) + + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + embed_url = url_or_none(player.get('embedUrl')) + if embed_url: + webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed page', fatal=False) + if webpage: + video = self._parse_json( + self._search_regex( + r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', + default='{}'), video_id) + if video: + title = video.get('episodeTitle') + subtitles = {} + subtitles_list = video.get('subtitles') + if isinstance(subtitles_list, list): + for cc in subtitles_list: + cc_url = url_or_none(cc.get('src')) + if not cc_url: + continue + lang = str_or_none(cc.get('language')) or 'nl' + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + return merge_dicts({ + 'title': title, + 'description': video.get('description'), + 'thumbnail': url_or_none( + video.get('still_image_url') or video.get('orig_image_url')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('broadcastDate')), + 'creator': video.get('channel'), + 'series': video.get('title'), + 'episode': title, + 'episode_number': int_or_none(video.get('episodeNumber')), + 'subtitles': subtitles, + }, info) + + return info + + def _get_old_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/%s' % video_id, video_id, @@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE): # JSON else: video_url = stream_info.get('url') - if not video_url or video_url in urls: + if not video_url or 'vodnotavailable.' in video_url or video_url in urls: continue urls.add(video_url) if determine_ext(video_url) == 'm3u8': From 333f617b1207cb53efaa5e2f7af174cfa87deee1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 1 Mar 2019 01:02:36 +0700 Subject: [PATCH 23/34] [ChangeLog] Actualize [ci skip] --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index f9dd7928f..f717f99a8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version <unreleased> + +Core ++ [downloader/external] Add support for rate limit and retries for wget +* [downloader/external] Fix infinite retries for curl (#19303) + +Extractors +* [npo] Fix extraction (#20084) +* [francetv:site] Extend video id regex (#20029, #20071) ++ [periscope] Extract width and height (#20015) +* [servus] Fix extraction (#19297) +* [bbccouk] Make subtitles non fatal (#19651) +* [metacafe] Fix family filter bypass (#19287) + + version 2019.02.18 Extractors From 04c33bdfb3cd73e71bf0788f02998cab30cf1da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 1 Mar 2019 01:03:51 +0700 Subject: [PATCH 24/34] release 2019.03.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ff626883d..71a500f04 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.02.18 +[debug] youtube-dl version 2019.03.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f717f99a8..018a30641 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2019.03.01 Core + [downloader/external] Add support for rate limit and retries for wget diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ea1d5a4a5..42ba37f15 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.02.18' +__version__ = '2019.03.01' From 06242d44fe261999e2424d9ecb00f20ff30ccb9b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 1 Mar 2019 08:14:34 +0100 Subject: [PATCH 25/34] [vimeo] add support for Vimeo Pro portfolio protected videos(closes #20070) --- youtube_dl/extractor/vimeo.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6215b3258..6f32ea6f1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -502,7 +502,11 @@ class VimeoIE(VimeoBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') orig_url = url - if mobj.group('pro') or mobj.group('player'): + if mobj.group('pro'): + # some videos require portfolio_id to be present in player url + # https://github.com/rg3/youtube-dl/issues/20070 + url = self._extract_url(url, self._download_webpage(url, video_id)) + elif mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id From c5b02efe20cff1612104fd731c7f02cbbce4f5f3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 1 Mar 2019 15:08:11 +0100 Subject: [PATCH 26/34] [sixplay] handle videos with empty assets(closes #20016) --- youtube_dl/extractor/sixplay.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 0c4f865ef..35bc9fa50 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -61,7 +61,8 @@ class SixPlayIE(InfoExtractor): quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] subtitles = {} - for asset in clip_data['assets']: + assets = clip_data.get('assets') or [] + for asset in assets: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls: From 398e1e21d6cbf6eb1e8e7e84de4fad30b7d59613 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 1 Mar 2019 15:34:05 +0100 Subject: [PATCH 27/34] [espn] extend _VALID_URL regex(closes #20013) --- youtube_dl/extractor/espn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 127c69b2e..8cc9bd165 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -29,7 +29,8 @@ class ESPNIE(OnceIE): (?: .*?\?.*?\bid=| /_/id/ - ) + )| + [^/]+/video/ ) )| (?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/ @@ -94,6 +95,9 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets', 'only_matching': True, + }, { + 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', + 'only_matching': True, }] def _real_extract(self, url): From dca0e0040ae97b2fc0cd54d5e819a5a278937350 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 2 Mar 2019 08:01:42 +0100 Subject: [PATCH 28/34] Revert "use older login method(closes #11572)" This reverts commit cc6a960e134614f8af2a42dcd8bf146d63638a3c. --- youtube_dl/extractor/crunchyroll.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 5e2cbe41d..ce2e2d3ba 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -56,17 +56,6 @@ class CrunchyrollBaseIE(InfoExtractor): if username is None: return - self._download_webpage( - 'https://www.crunchyroll.com/?a=formhandler', - None, 'Logging in', 'Wrong login info', - data=urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'next_url': 'https://www.crunchyroll.com/acct/membership', - 'name': username, - 'password': password, - })) - - ''' login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -110,7 +99,6 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - ''' def _real_initialize(self): self._login() From a8f83f0c56e81b871a46c18fa9ebc6643370fa48 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 2 Mar 2019 08:25:47 +0100 Subject: [PATCH 29/34] [crunchyroll] fix is_logged check --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index ce2e2d3ba..fd1e7afad 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -60,7 +60,7 @@ class CrunchyrollBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login page') def is_logged(webpage): - return '<title>Redirecting' in webpage + return 'href="/logout"' in webpage # Already logged in if is_logged(login_page): From 7465e0aee2301c3e86fe38d6e0ef5ad01c16ec79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Mar 2019 06:25:45 +0700 Subject: [PATCH 30/34] [spankbang] Fix extraction (closes #20023) --- youtube_dl/extractor/spankbang.py | 45 +++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index fbe6ef31a..f11d728ca 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -9,6 +9,8 @@ from ..utils import ( parse_duration, parse_resolution, str_to_int, + url_or_none, + urlencode_postdata, ) @@ -64,16 +66,49 @@ class SpankBangIE(InfoExtractor): 'Video %s is not available' % video_id, expected=True) formats = [] - for mobj in re.finditer( - r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', - webpage): - format_id, format_url = mobj.group('id', 'url') + + def extract_format(format_id, format_url): + f_url = url_or_none(format_url) + if not f_url: + return f = parse_resolution(format_id) f.update({ - 'url': format_url, + 'url': f_url, 'format_id': format_id, }) formats.append(f) + + STREAM_URL_PREFIX = 'stream_url_' + + for mobj in re.finditer( + r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2' + % STREAM_URL_PREFIX, webpage): + extract_format(mobj.group('id', 'url')) + + if not formats: + stream_key = self._search_regex( + r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + webpage, 'stream key', group='value') + + sb_csrf_session = self._get_cookies( + 'https://spankbang.com')['sb_csrf_session'].value + + stream = self._download_json( + 'https://spankbang.com/api/videos/stream', video_id, + 'Downloading stream JSON', data=urlencode_postdata({ + 'id': stream_key, + 'data': 0, + 'sb_csrf_session': sb_csrf_session, + }), headers={ + 'Referer': url, + 'X-CSRFToken': sb_csrf_session, + }) + + for format_id, format_url in stream.items(): + if format_id.startswith(STREAM_URL_PREFIX): + extract_format( + format_id[len(STREAM_URL_PREFIX):], format_url) + self._sort_formats(formats) title = self._html_search_regex( From 7aeb788e564d397face83b580362189753edd9dd Mon Sep 17 00:00:00 2001 From: cclauss <cclauss@me.com> Date: Sun, 3 Mar 2019 02:16:48 +0100 Subject: [PATCH 31/34] [travis] Remove sudo: false Travis now recommends removing `sudo: false` from configuration: https://blog.travis-ci.com/2018-11-19-required-linux-infrastructure-migration. --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 79287ccf6..82e81d078 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ python: - "3.6" - "pypy" - "pypy3" -sudo: false env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download From 8ae113ca9df0abd790e3391cd529bac42fce304f Mon Sep 17 00:00:00 2001 From: dimqua <dimqua@users.noreply.github.com> Date: Sun, 3 Mar 2019 04:19:36 +0300 Subject: [PATCH 32/34] [youtube] Add more invidious instances See [Invidious-Instances](https://github.com/omarroth/invidious/wiki/Invidious-Instances) for the reference. --- youtube_dl/extractor/youtube.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c8bf98b58..457e2acea 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -352,6 +352,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| (?:www\.)?invidio\.us/| + (?:www\.)?invidious\.snopyta\.org/| + (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?vid\.wxzm\.sx/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: From 0a5baf9c210df9f492ae48dd8fdae90561c971bd Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 3 Mar 2019 06:18:15 +0100 Subject: [PATCH 33/34] [libsyn] improve extraction(closes #20229) --- youtube_dl/extractor/libsyn.py | 64 +++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index f7311f483..2cf444258 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_duration, + strip_or_none, unified_strdate, ) @@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor): 'id': '6385796', 'ext': 'mp3', 'title': "Champion Minded - Developing a Growth Mindset", - 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + # description fetched using another request: + # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 + # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, @@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor): }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - url = m.group('mainurl') + url, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - podcast_title = self._search_regex( - r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None) - if podcast_title: - podcast_title = podcast_title.strip() - episode_title = self._search_regex( - r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title') - if episode_title: - episode_title = episode_title.strip() + data = self._parse_json(self._search_regex( + r'var\s+playlistItem\s*=\s*({.+?});', + webpage, 'JSON data block'), video_id) + + episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage) + if not episode_title: + self._search_regex( + [r'data-title="([^"]+)"', r'<title>(.+?)'], + webpage, 'episode title') + episode_title = episode_title.strip() + + podcast_title = strip_or_none(clean_html(self._search_regex( + r'

([^<]+)

', webpage, 'podcast title', + default=None) or get_element_by_class('podcast-title', webpage))) title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + formats = [] + for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')): + f_url = data.get(k) + if not f_url: + continue + formats.append({ + 'url': f_url, + 'format_id': format_id, + }) + description = self._html_search_regex( r'(.+?)

', webpage, 'description', default=None) @@ -61,27 +79,15 @@ class LibsynIE(InfoExtractor): # Strip non-breaking and normal spaces description = description.replace('\u00A0', ' ').strip() release_date = unified_strdate(self._search_regex( - r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) - - data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') - data = json.loads(data_json) - - formats = [{ - 'url': data['media_url'], - 'format_id': 'main', - }, { - 'url': data['media_url_libsyn'], - 'format_id': 'libsyn', - }] - thumbnail = data.get('thumbnail_url') - duration = parse_duration(data.get('duration')) + r'
Released: ([^<]+)<', + webpage, 'release date', default=None) or data.get('release_date')) return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': data.get('thumbnail_url'), 'upload_date': release_date, - 'duration': duration, + 'duration': parse_duration(data.get('duration')), 'formats': formats, } From e7e62441cdde6dca6211c073be73677f195a0dff Mon Sep 17 00:00:00 2001 From: remitamine Date: Sun, 3 Mar 2019 13:23:59 +0100 Subject: [PATCH 34/34] [utils] strip #HttpOnly_ prefix from cookies files (#20219) --- test/test_YoutubeDLCookieJar.py | 10 ++++++++++ test/testdata/cookies/httponly_cookies.txt | 6 ++++++ youtube_dl/utils.py | 18 +++++++++++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 test/testdata/cookies/httponly_cookies.txt diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index 6a8243590..f959798de 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -29,6 +29,16 @@ class TestYoutubeDLCookieJar(unittest.TestCase): tf.close() os.remove(tf.name) + def test_strip_httponly_prefix(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + + def assert_cookie_has_value(key): + self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') + + assert_cookie_has_value('HTTPONLY_COOKIE') + assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/cookies/httponly_cookies.txt b/test/testdata/cookies/httponly_cookies.txt new file mode 100644 index 000000000..c46541d6b --- /dev/null +++ b/test/testdata/cookies/httponly_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE +www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f5a0bb4b0..a71eda85d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1141,6 +1141,8 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + _HTTPONLY_PREFIX = '#HttpOnly_' + def save(self, filename=None, ignore_discard=False, ignore_expires=False): # Store session cookies with `expires` set to 0 instead of an empty # string @@ -1150,7 +1152,21 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) def load(self, filename=None, ignore_discard=False, ignore_expires=False): - compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires) + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + + cf = io.StringIO() + with open(filename) as f: + for line in f: + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + cf.write(compat_str(line)) + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) # Session cookies are denoted by either `expires` field set to # an empty string or 0. MozillaCookieJar only recognizes the former # (see [1]). So we need force the latter to be recognized as session