From 6b03e1e25d05ddb3dbd577b045e44fa9eb05bdc0 Mon Sep 17 00:00:00 2001 From: Aleksandar Topuzovic Date: Fri, 13 May 2016 15:52:52 +0100 Subject: [PATCH 01/24] [HRTi] Implement extractor for Croatian Radiotelevision --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hrti.py | 149 +++++++++++++++++++++++++++++ youtube_dl/utils.py | 2 + 3 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/hrti.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 16fa4d35c..ee1718f6a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -326,6 +326,7 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrti import HRTiIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py new file mode 100644 index 000000000..55ff981e2 --- /dev/null +++ b/youtube_dl/extractor/hrti.py @@ -0,0 +1,149 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import ( + compat_HTTPError +) +from ..utils import ( + sanitized_Request, + ExtractorError +) + + +class HRTiIE(InfoExtractor): + ''' + Information Extractor for Croatian Radiotelevision video on demand site + https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + ''' + _NETRC_MACHINE = 'hrti' + + APP_LANGUAGE = 'hr' + APP_VERSION = '1.1' + APP_PUBLICATION_ID = 'all_in_one' + + _VALID_URL = r'https?://hrti.hrt.hr/#/video/show/(?P[0-9]+)/(?P(\w|-)+)?' 
+ _TEST = { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'ext': 'mp4', + 'name': 'REPUBLIKA, dokumentarna serija (4_6)-2251938', + }, + 'skip': 'Requires login' + } + + def _initialize_api(self): + '''Initializes the API and obtains the required urls''' + api_url = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + app_data = json.dumps({ + 'application_publication_id': HRTiIE.APP_PUBLICATION_ID + }) + self.uuid = self._download_json(api_url, None, note='Getting UUID', + errnote='Unable to obtain an UUID', + data=app_data)['uuid'] + + app_data = json.dumps({ + 'uuid': self.uuid, + 'application_publication_id': HRTiIE.APP_PUBLICATION_ID, + 'screen_height': 1080, + 'screen_width': 1920, + 'os': 'Windows', + 'os_version': 'NT 4.0', + 'device_model_string_id': 'chrome 42.0.2311.135', + 'application_version': HRTiIE.APP_VERSION + }) + + req = sanitized_Request(api_url, data=app_data) + req.get_method = lambda: 'PUT' + + resources = self._download_json( + req, None, note='Getting API endpoint and session information', + errnote='Unable to get endpoint and session information', + headers={'Content-type': 'application/json'}) + + self.session_id = resources['session_id'] + modules = resources['modules'] + + self.search_url = modules['vod_catalog']['resources']['search']['uri'] + self.search_url = self.search_url.format( + language=HRTiIE.APP_LANGUAGE, + application_id=HRTiIE.APP_PUBLICATION_ID) + + self.login_url = modules['user']['resources']['login']['uri'] + self.login_url = self.login_url.format(session_id=self.session_id) + self.login_url += '/format/json' + + self.logout_url = modules['user']['resources']['logout']['uri'] + + def _login(self): + '''Performs a login to the webservice''' + (username, password) = self._get_login_info() + + if username is None or password is None: + self.raise_login_required() + + auth_data = json.dumps({ + 'username': username, + 
'password': password, + }) + try: + auth_info = self._download_json( + self.login_url, None, note='Authenticating', + errnote='Unable to log in', data=auth_data) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 406: + raise ExtractorError('Unable to login, ' + + 'incorrect username and/or password') + raise + + self.token = auth_info['secure_streaming_token'] + self.access_token = auth_info['session_token'] + + self.logout_url = self.logout_url.format(session_id=self.session_id, + access_token=self.access_token) + self.logout_url += '/format/json' + + def _real_initialize(self): + '''Performs necessary operations so that the information extractor is + ready for operation''' + self._initialize_api() + self._login() + + def _logout(self): + '''Performs logout from the webservice''' + self._download_json(self.logout_url, None, note='Logout', + errnote='Unable to log out', fatal=False) + + def _real_extract(self, url): + '''Extract the data necessary to download the video''' + video_id = self._match_id(url) + + metadata_url = self.search_url + \ + '/video_id/{video_id}/format/json'.format(video_id=video_id) + + metadata = self._download_json(metadata_url, video_id, + note='Getting video metadata') + video = metadata['video'][0] + title_info = video.get('title', {}) + title = title_info.get('title_long') + description = title_info.get('summary_long') + + movie = video['video_assets']['movie'][0] + url = movie['url'].format(TOKEN=self.token) + + formats = self._extract_m3u8_formats(url, video_id, 'mp4') + + self._sort_formats(formats) + + self._logout() + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a2cfb48a6..2b61e6966 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1926,6 +1926,8 @@ def update_Request(req, url=None, data=None, headers={}, query={}): origin_req_host=req.origin_req_host, 
unverifiable=req.unverifiable) if hasattr(req, 'timeout'): new_req.timeout = req.timeout + if req.get_method() == 'PUT': + new_req.get_method = lambda : 'PUT' return new_req From 95cf60e826e38ed1d46743c02549bf0bd187a0bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 02:21:32 +0700 Subject: [PATCH 02/24] [utils] Add PUTRequest --- youtube_dl/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2b61e6966..495878a0e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1625,6 +1625,11 @@ class HEADRequest(compat_urllib_request.Request): return 'HEAD' +class PUTRequest(compat_urllib_request.Request): + def get_method(self): + return 'PUT' + + def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: @@ -1920,14 +1925,18 @@ def update_Request(req, url=None, data=None, headers={}, query={}): req_headers.update(headers) req_data = data or req.data req_url = update_url_query(url or req.get_full_url(), query) - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = compat_urllib_request.Request new_req = req_type( req_url, data=req_data, headers=req_headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) if hasattr(req, 'timeout'): new_req.timeout = req.timeout - if req.get_method() == 'PUT': - new_req.get_method = lambda : 'PUT' return new_req From e3755a624b68d24b224ec48eb36833dad1585e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 02:22:14 +0700 Subject: [PATCH 03/24] [hrti] Improve and add support for playlists (Closes #9482) --- youtube_dl/extractor/hrti.py | 237 +++++++++++++++++++++-------------- 1 file changed, 145 insertions(+), 92 
deletions(-) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 55ff981e2..4b346f712 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -2,148 +2,201 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import ( - compat_HTTPError -) +from ..compat import compat_HTTPError from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_age_limit, sanitized_Request, - ExtractorError + try_get, ) -class HRTiIE(InfoExtractor): - ''' - Information Extractor for Croatian Radiotelevision video on demand site - https://hrti.hrt.hr - Reverse engineered from the JavaScript app in app.min.js - ''' +class HRTiBaseIE(InfoExtractor): + """ + Base Information Extractor for Croatian Radiotelevision + video on demand site https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + """ _NETRC_MACHINE = 'hrti' - APP_LANGUAGE = 'hr' - APP_VERSION = '1.1' - APP_PUBLICATION_ID = 'all_in_one' - - _VALID_URL = r'https?://hrti.hrt.hr/#/video/show/(?P[0-9]+)/(?P(\w|-)+)?' 
- _TEST = { - 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', - 'info_dict': { - 'id': '2181385', - 'ext': 'mp4', - 'name': 'REPUBLIKA, dokumentarna serija (4_6)-2251938', - }, - 'skip': 'Requires login' - } + _APP_LANGUAGE = 'hr' + _APP_VERSION = '1.1' + _APP_PUBLICATION_ID = 'all_in_one' + _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' def _initialize_api(self): - '''Initializes the API and obtains the required urls''' - api_url = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' - app_data = json.dumps({ - 'application_publication_id': HRTiIE.APP_PUBLICATION_ID - }) - self.uuid = self._download_json(api_url, None, note='Getting UUID', - errnote='Unable to obtain an UUID', - data=app_data)['uuid'] + init_data = { + 'application_publication_id': self._APP_PUBLICATION_ID + } - app_data = json.dumps({ - 'uuid': self.uuid, - 'application_publication_id': HRTiIE.APP_PUBLICATION_ID, - 'screen_height': 1080, - 'screen_width': 1920, - 'os': 'Windows', - 'os_version': 'NT 4.0', - 'device_model_string_id': 'chrome 42.0.2311.135', - 'application_version': HRTiIE.APP_VERSION - }) + uuid = self._download_json( + self._API_URL, None, note='Downloading uuid', + errnote='Unable to download uuid', + data=json.dumps(init_data).encode('utf-8'))['uuid'] - req = sanitized_Request(api_url, data=app_data) + app_data = { + 'uuid': uuid, + 'application_publication_id': self._APP_PUBLICATION_ID, + 'application_version': self._APP_VERSION + } + + req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) req.get_method = lambda: 'PUT' resources = self._download_json( - req, None, note='Getting API endpoint and session information', - errnote='Unable to get endpoint and session information', - headers={'Content-type': 'application/json'}) + req, None, note='Downloading session information', + errnote='Unable to download session information') + + self._session_id = 
resources['session_id'] - self.session_id = resources['session_id'] modules = resources['modules'] - self.search_url = modules['vod_catalog']['resources']['search']['uri'] - self.search_url = self.search_url.format( - language=HRTiIE.APP_LANGUAGE, - application_id=HRTiIE.APP_PUBLICATION_ID) + self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( + language=self._APP_LANGUAGE, + application_id=self._APP_PUBLICATION_ID) - self.login_url = modules['user']['resources']['login']['uri'] - self.login_url = self.login_url.format(session_id=self.session_id) - self.login_url += '/format/json' + self._login_url = (modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) - self.logout_url = modules['user']['resources']['logout']['uri'] + self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - '''Performs a login to the webservice''' (username, password) = self._get_login_info() - + # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() - auth_data = json.dumps({ + auth_data = { 'username': username, 'password': password, - }) + } + try: auth_info = self._download_json( - self.login_url, None, note='Authenticating', - errnote='Unable to log in', data=auth_data) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 406: - raise ExtractorError('Unable to login, ' + - 'incorrect username and/or password') - raise + self._login_url, None, note='Logging in', errnote='Unable to log in', + data=json.dumps(auth_data).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: + auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + else: + raise - self.token = auth_info['secure_streaming_token'] - self.access_token = auth_info['session_token'] + error_message = auth_info.get('error', {}).get('message') + if 
error_message: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_message), + expected=True) - self.logout_url = self.logout_url.format(session_id=self.session_id, - access_token=self.access_token) - self.logout_url += '/format/json' + self._token = auth_info['secure_streaming_token'] def _real_initialize(self): - '''Performs necessary operations so that the information extractor is - ready for operation''' self._initialize_api() self._login() - def _logout(self): - '''Performs logout from the webservice''' - self._download_json(self.logout_url, None, note='Logout', - errnote='Unable to log out', fatal=False) + +class HRTiIE(HRTiBaseIE): + _VALID_URL = r'''(?x) + (?: + hrti:(?P[0-9]+)| + https?:// + hrti\.hrt\.hr/\#/video/show/(?P[0-9]+)/(?P[^/]+)? + ) + ''' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'display_id': 'republika-dokumentarna-serija-16-hd', + 'ext': 'mp4', + 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', + 'description': 'md5:48af85f620e8e0e1df4096270568544f', + 'duration': 2922, + 'view_count': int, + 'average_rating': int, + 'episode_number': int, + 'season_number': int, + 'age_limit': 12, + }, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', + 'only_matching': True, + }, { + 'url': 'hrti:2181385', + 'only_matching': True, + }] def _real_extract(self, url): - '''Extract the data necessary to download the video''' - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('short_id') or mobj.group('id') + display_id = mobj.group('display_id') or video_id - metadata_url = self.search_url + \ - '/video_id/{video_id}/format/json'.format(video_id=video_id) + video = self._download_json( + '%s/video_id/%s/format/json' % (self._search_url, video_id), + display_id, 'Downloading video metadata JSON')['video'][0] - metadata = 
self._download_json(metadata_url, video_id, - note='Getting video metadata') - video = metadata['video'][0] - title_info = video.get('title', {}) - title = title_info.get('title_long') - description = title_info.get('summary_long') + title_info = video['title'] + title = title_info['title_long'] movie = video['video_assets']['movie'][0] - url = movie['url'].format(TOKEN=self.token) - - formats = self._extract_m3u8_formats(url, video_id, 'mp4') - + m3u8_url = movie['url'].format(TOKEN=self._token) + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') self._sort_formats(formats) - self._logout() + description = clean_html(title_info.get('summary_long')) + age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) + view_count = int_or_none(video.get('views')) + average_rating = int_or_none(video.get('user_rating')) + duration = int_or_none(movie.get('duration')) return { 'id': video_id, + 'display_id': display_id, 'title': title, 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, 'formats': formats, } + + +class HRTiPlaylistIE(HRTiBaseIE): + _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P[0-9]+)/(?P[^/]+)?' 
+ _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', + 'info_dict': { + 'id': '212', + 'title': 'ekumena', + }, + 'playlist_mincount': 8, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category_id = mobj.group('id') + display_id = mobj.group('display_id') or category_id + + response = self._download_json( + '%s/category_id/%s/format/json' % (self._search_url, category_id), + display_id, 'Downloading video metadata JSON') + + video_ids = try_get( + response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], + list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + + entries = [self.url_result('hrti:%s' % category_id) for category_id in video_ids] + + return self.playlist_result(entries, category_id, display_id) From c3a5dd3b5d636935dea53ee23cc8a7da9a175f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 02:22:59 +0700 Subject: [PATCH 04/24] Credit @atopuzov for hrti (#9482) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bdd29687d..f74b30d07 100644 --- a/AUTHORS +++ b/AUTHORS @@ -176,3 +176,4 @@ Déstin Reed Roman Tsiupa Artur Krysiak Jakub Adam Wieczorek +Aleksandar Topuzović From 9b724d7277716fc6f0a0cdc5cdc857ac0ea2642b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 02:25:39 +0700 Subject: [PATCH 05/24] [extractors] Add hrti:playlist import --- youtube_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ee1718f6a..5dab055db 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -326,7 +326,10 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE 
from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE -from .hrti import HRTiIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE From 1b734adb2d3bb17af67b5fef933a6fafe71e4cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 03:12:15 +0700 Subject: [PATCH 06/24] [xtube] Fix extraction (Closes #9953, closes #9961) --- youtube_dl/extractor/xtube.py | 45 ++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 4075b8a4f..83bc1fef2 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -4,17 +4,23 @@ import itertools import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, orderedSet, + parse_duration, sanitized_Request, str_to_int, ) class XTubeIE(InfoExtractor): - _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P[^/]+)-))(?P[^/?&#]+)' + _VALID_URL = r'''(?x) + (?: + xtube:| + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P[^/]+)-) + ) + (?P[^/?&#]+) + ''' _TESTS = [{ # old URL schema @@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor): 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', 'duration': 450, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, } }, { @@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') webpage = self._download_webpage(req, display_id) - flashvars = self._parse_json( - self._search_regex( - r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), - video_id)['flashvars'] + sources = self._parse_json(self._search_regex( + r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id) - title = flashvars.get('title') or self._search_regex( - 
r'<h1>([^<]+)</h1>
', webpage, 'title') - video_url = compat_urllib_parse_unquote(flashvars['video_url']) - duration = int_or_none(flashvars.get('video_duration')) + formats = [] + for format_id, format_url in sources.items(): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + self._sort_formats(formats) - uploader = self._search_regex( - r']+name="contentOwnerId"[^>]+value="([^"]+)"', - webpage, 'uploader', fatal=False) + title = self._search_regex( + (r'

(?P[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') description = self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + uploader = self._search_regex( + (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', + r'<span[^>]+class="nickname"[^>]*>([^<]+)'), + webpage, 'uploader', fatal=False) + duration = parse_duration(self._search_regex( + r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', webpage, 'view count', fatal=False)) @@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, 'description': description, 'uploader': uploader, @@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'age_limit': 18, + 'formats': formats, } From a0cfd82dda86b4e3e882819216f177b003a1b473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jul 2016 03:19:22 +0700 Subject: [PATCH 07/24] release 2016.07.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 4 +++- youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 637103b6b..8dbd3cbfa 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.03*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.02 +[debug] youtube-dl version 2016.07.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8fd1ab5af..93237022f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -283,6 +283,8 @@ - **HotStar** - **Howcast** - **HowStuffWorks** + - **HRTi** + - **HRTiPlaylist** - **HuffPost**: Huffington Post - **Hypem** - **Iconosquare** @@ -329,7 +331,7 @@ - **kuwo:mv**: 酷我音乐 - MV - **kuwo:singer**: 酷我音乐 - 歌手 - **kuwo:song**: 酷我音乐 - - **la7.tv** + - **la7.it** - **Laola1Tv** - **Le**: 乐视网 - **Learnr** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d0483f83b..1dda4c6d1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.02' +__version__ = '2016.07.03' From bf83024826e325c474aed5128c012b78a5d15210 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 2 Jul 2016 21:19:07 +0100 Subject: [PATCH 08/24] [theplatform] add basic support for Adobe Pass --- 
youtube_dl/extractor/cbsinteractive.py | 5 +- youtube_dl/extractor/theplatform.py | 108 ++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 0011c3029..821db20b2 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE): media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) formats, subtitles = [], {} - if site == 'cnet': - formats, subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue @@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) info.update({ 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 07d222ae3..bb3efc4ea 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -6,6 +6,7 @@ import time import hmac import binascii import hashlib +import netrc from .once import OnceIE @@ -24,6 +25,9 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, + unescapeHTML, + urlencode_postdata, + unified_timestamp, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -62,10 +66,11 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles - def get_metadata(self, path, video_id): + def _download_theplatform_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info = self._download_json(info_url, video_id) + return 
self._download_json(info_url, video_id) + def _parse_theplatform_metadata(self, info): subtitles = {} captions = info.get('captions') if isinstance(captions, list): @@ -86,6 +91,10 @@ class ThePlatformBaseIE(OnceIE): 'uploader': info.get('billingCode'), } + def _extract_theplatform_metadata(self, path, video_id): + info = self._download_theplatform_metadata(path, video_id) + return self._parse_theplatform_metadata(info) + class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) @@ -158,6 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', 'only_matching': True, }] + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' @classmethod def _extract_urls(cls, webpage): @@ -192,6 +202,96 @@ class ThePlatformIE(ThePlatformBaseIE): sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', + } + + guid = xml_text(resource, 'guid') + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token: + token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) + if token_expires and token_expires >= time.time(): + authn_token = None + if not authn_token: + # TODO add support for other TV Providers + mso_id = 'DTV' + login_info = netrc.netrc().authenticators(mso_id) + if not login_info: + return None + + def 
post_form(form_page, note, data={}): + post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') + return self._download_webpage( + post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + provider_redirect_page = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + provider_login_page = post_form( + provider_redirect_page, 'Downloading Provider Login Page') + mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { + 'username': login_info[0], + 'password': login_info[2], + }) + post_form(mvpd_confirm_page, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + authn_token = unescapeHTML(xml_text(session, 'authnToken')) + requestor_info['authn_token'] = authn_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + authz_token = requestor_info.get(guid) + if not authz_token: + authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, + 'Retrieving Authorization Token', data=urlencode_postdata({ + 'resource_id': resource, + 'requestor_id': requestor_id, + 'authentication_token': authn_token, + 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), + 'userMeta': '1', + }), headers=mvpd_headers) + authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) + requestor_info[guid] = authz_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + mvpd_headers.update({ + 'ap_19': 
xml_text(authn_token, 'simpleSamlNameID'), + 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), + }) + + return self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', + video_id, 'Retrieving Media Token', data=urlencode_postdata({ + 'authz_token': authz_token, + 'requestor_id': requestor_id, + 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), + 'hashed_guid': 'false', + }), headers=mvpd_headers) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -265,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) self._sort_formats(formats) - ret = self.get_metadata(path, video_id) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, @@ -339,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) categories = [item['media$name'] for item in entry.get('media$categories', [])] - ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, From 05c7feec77f42145815695943d24cb6a9d7c7baa Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 2 Jul 2016 21:20:59 +0100 Subject: [PATCH 09/24] [aenetworks] add support Adobe Pass auth --- youtube_dl/extractor/aenetworks.py | 77 ++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8b60e2ab6..bc58d7b4d 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .common 
import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, update_url_query, @@ -15,28 +15,15 @@ from ..compat import ( ) -class AENetworksBaseIE(InfoExtractor): - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t' - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } +class AENetworksBaseIE(ThePlatformIE): + _THEPLATFORM_KEY = 'crazyjava' + _THEPLATFORM_SECRET = 's3cr3t' class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', @@ -76,9 +63,15 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }] + _DOMAIN_TO_REQUESTOR_ID = { + 'history.com': 'HISTORY', + 'aetv.com': 'AETV', + 'mylifetime.com': 'LIFETIME', + 'fyi.tv': 'FYI', + } def _real_extract(self, url): - show_path, movie_display_id = re.match(self._VALID_URL, url).groups() + domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups() display_id = show_path or movie_display_id webpage = self._download_webpage(url, display_id) if show_path: @@ -103,16 +96,32 @@ class AENetworksIE(AENetworksBaseIE): episode_attributes['data-videoid'])) return self.playlist_result( entries, self._html_search_meta('aetn:SeasonId', 
webpage)) + + query = { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( r"media_url\s*=\s*'([^']+)'", webpage, 'video url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update(self.theplatform_url_result( - media_url, video_id, { - 'mbr': 'true', - 'assetTypes': 'medium_video_s3' - })) + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + if theplatform_metadata.get('AETN$isBehindWall'): + requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] + resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s%s%s%s' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._search_json_ld(webpage, video_id, fatal=False)) + media_url = update_url_query(media_url, query) + media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + }) return info @@ -149,6 +158,22 @@ class HistoryTopicIE(AENetworksBaseIE): 'only_matching': True, }] + def theplatform_url_result(self, theplatform_url, video_id, query): + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url( + update_url_query(theplatform_url, query), + { + 'sig': { + 'key': self._THEPLATFORM_KEY, + 'secret': self._THEPLATFORM_SECRET, + }, + 'force_smil_url': True + }), + 'ie_key': 'ThePlatform', + } + def _real_extract(self, url): topic_id, video_display_id = 
re.match(self._VALID_URL, url).groups() if video_display_id: From a1f6f5c768a506674928530990b7f95c605eac2c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 2 Jul 2016 21:22:08 +0100 Subject: [PATCH 10/24] [nationalgeographic] add support Adobe Pass auth --- youtube_dl/extractor/nationalgeographic.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 722518663..ed76798aa 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, url_basename, @@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor): } -class NationalGeographicChannelIE(InfoExtractor): +class NationalGeographicChannelIE(ThePlatformIE): IE_NAME = 'natgeo:channel' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)' @@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor): release_url = self._search_regex( r'video_auth_playlist_url\s*=\s*"([^"]+)"', webpage, 'release url') + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) + if is_auth == 'auth': + auth_resource_id = self._search_regex( + r"video_auth_resourceId\s*=\s*'([^']+)'", + webpage, 'auth resource id') + query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or '' return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'url': smuggle_url( - update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), + update_url_query(release_url, query), {'force_smil_url': True}), 'display_id': display_id, } From 4cb13d0d6af20906f973f0b281f9efba4bc65d41 Mon Sep 17 00:00:00 2001 From:
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Jul 2016 23:02:14 +0200 Subject: [PATCH 11/24] [hrti] Don't redefine variable in list comprehension --- youtube_dl/extractor/hrti.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 4b346f712..656ce6d05 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -197,6 +197,6 @@ class HRTiPlaylistIE(HRTiBaseIE): response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], list) or [video['id'] for video in response.get('videos', []) if video.get('id')] - entries = [self.url_result('hrti:%s' % category_id) for category_id in video_ids] + entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] return self.playlist_result(entries, category_id, display_id) From 04006fae8df634ae98abd5bc8e538d0a5addf3fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 3 Jul 2016 11:31:07 +0700 Subject: [PATCH 12/24] [README.md] Start writing youtube-dl coding conventions --- README.md | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 146 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c6feef116..1f014b48f 100644 --- a/README.md +++ b/README.md @@ -890,9 +890,17 @@ If you want to add support for a new site, first of all **make sure** this site After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) -2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` -3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +2. Check out the source code with: + + git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git + +3. Start a new git branch with + + cd youtube-dl + git checkout -b yourextractor + 4. 
Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + ```python # coding: utf-8 from __future__ import unicode_literals @@ -936,19 +944,151 @@ After you have ensured this site is distributing it's content legally, you can f 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. 
For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. -9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! 
+## youtube-dl coding conventions + +This section introduces guidelines for writing idiomatic, robust and future-proof extractor code. + +Extractors are very fragile by nature since they depend on the layout of the source data provided by a 3rd party media hoster out of your control, and this layout tends to change. As an extractor implementer, your task is not only to write code that will extract media links and metadata correctly but also to minimize the code's dependency on the source's layout, and even to make the code foresee potential future changes and be ready for them. This is important because it allows the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though such breakage is easily fixed by releasing a new version of youtube-dl with the fix incorporated, all previous versions remain broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non-rolling-release distros. + +### Mandatory and optional metafields + +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl: + + - `id` (media identifier) + - `title` (media title) + - `url` (media download URL) or `formats` + +In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory.
Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken. + +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. + +#### Example + +Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`: + +```python +meta = self._download_json(url, video_id) +``` + +Assume at this point `meta`'s layout is: + +```python +{ + ... + "summary": "some fancy summary text", + ... +} +``` + +Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like: + +```python +description = meta.get('summary') # correct +``` + +and not like: + +```python +description = meta['summary'] # incorrect +``` + +The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some time later but with former approach extraction will just go ahead with `description` set to `None` that is perfectly fine (remember `None` is equivalent for absence of data). 
+ +Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', fatal=False) +``` + +With `fatal` set to `False` if `_search_regex` fails to extract `description` it will emit a warning and continue extraction. + +You can also pass `default=<some fallback value>`, for example: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', default=None) +``` + +On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. + +### Provide fallbacks + +When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources become unavailable. + +#### Example + +Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like: + +```python +title = meta['title'] +``` + +If `title` disappears from `meta` in future due to some changes on hoster's side the extraction would fail since `title` is mandatory. That's expected. + +Assume that you have another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario: + +```python +title = meta.get('title') or self._og_search_title(webpage) +``` + +This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. + +### Make regular expressions flexible + +When using regular expressions try to write them fuzzy and flexible.
+ +#### Example + +Say you need to extract `title` from the following HTML code: + +```html +<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span> +``` + +The code for that task should look similar to: + +```python +title = self._search_regex( + r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title') +``` + +Or even better: + +```python +title = self._search_regex( + r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)', + webpage, 'title', group='title') +``` + +Note how you tolerate potential changes in `style` attribute's value or switch from using double quotes to single for `class` attribute. + +The code definitely should not look like: + +```python +title = self._search_regex( + r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>', + webpage, 'title', group='title') +``` + +### Use safe conversion functions + +Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + # EMBEDDING YOUTUBE-DL youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
From 1f552340574d2702fd0a80880aa6def1764f124a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jul 2016 11:31:49 +0700 Subject: [PATCH 13/24] Add PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..f24bb4b09 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,22 @@ +## Please follow the guide below + +- You will be asked some questions, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *pull request* (like that [x]) +- Use *Preview* tab to see how your *pull request* will actually look like + +--- + +### Before submitting a *pull request* make sure you have: +- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests + +### What is the purpose of your *pull request*? +- [ ] Bug fix +- [ ] New extractor +- [ ] New feature + +--- + +### Description of your *pull request* and other information + +Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible. 
From c723d1cd8dad872791ea2abdfb1bdfcbeb477f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jul 2016 11:35:13 +0700 Subject: [PATCH 14/24] [README.md] Update some codebase links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1f014b48f..6218060d6 100644 --- a/README.md +++ b/README.md @@ -964,7 +964,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -972,7 +972,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken. 
-[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example From 2cb31d288e945a98a1e23023e17c01c4dc5c1cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jul 2016 13:01:04 +0700 Subject: [PATCH 15/24] [history:topic] Relax _VALID_URL --- youtube_dl/extractor/aenetworks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index bc58d7b4d..8f53050c9 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -128,7 +128,7 @@ class AENetworksIE(AENetworksBaseIE): class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' - _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)/videos(?:/(?P<video_display_id>[^/?#]+))?' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?' 
_TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', 'info_dict': { @@ -156,6 +156,12 @@ class HistoryTopicIE(AENetworksBaseIE): }, { 'url': 'http://www.history.com/topics/world-war-i-history/videos', 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', + 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', + 'only_matching': True, }] def theplatform_url_result(self, theplatform_url, video_id, query): From 369bb0620620d6499dc1d8db0f9b2624a76d941d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 14:11:29 +0800 Subject: [PATCH 16/24] [facebook] Improve embed detection (#5701) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index bbdb14366..f5d4f966a 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -139,7 +139,7 @@ class FacebookIE(InfoExtractor): # Facebook API embed # see https://developers.facebook.com/docs/plugins/embedded-video-player mobj = re.search(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\bfb-video\b[^\'"]*(?P=q1)[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) if mobj is not None: return mobj.group('url') From e793338c880f32f32eaec34e9bd338a097ab8fb0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 14:12:02 +0800 Subject: [PATCH 17/24] [buzzfeed] Detect Facebook embed and update _TESTS Closes #5701 --- youtube_dl/extractor/buzzfeed.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/buzzfeed.py 
b/youtube_dl/extractor/buzzfeed.py index df503ecc0..75fa92d7c 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -5,6 +5,7 @@ import json import re from .common import InfoExtractor +from .facebook import FacebookIE class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'aVCR29aE_OQ', 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', - 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', - 'uploader': 'Buddhanz', - 'title': 'Angry Ram destroys a punching bag', + 'uploader': 'Angry Ram', } }] }, { @@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'mVmBL8B-In0', 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the', 'uploader': 're:^Munchkin the', - 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], }] def _real_extract(self, url): @@ -66,6 +84,10 @@ class 
BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url: + entries.append(self.url_result(facebook_url)) + return { '_type': 'playlist', 'id': playlist_id, From f1388739002a7fd1e8e9c41b642734786fc6c391 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 15:39:24 +0800 Subject: [PATCH 18/24] [rai] Fix extraction and update _TESTS Closes #8617 Closes #9157 Closes #9232 --- youtube_dl/extractor/rai.py | 109 ++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1aa1..946741f43 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,18 +1,16 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + int_or_none, parse_duration, unified_strdate, - int_or_none, + update_url_query, xpath_text, ) @@ -22,26 +20,30 @@ class RaiTVIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '96382709b61dd64a6b88e0f791e6df4c', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', 'duration': 6160, + 'thumbnail': 're:^https?://.*\.jpg$', } }, { + # no m3u8 stream 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': 'd9751b78eac9710d62c2447b224dea39', + # HDS download, MD5 is unstable 'info_dict': { 'id': 
'04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'flv', 'title': 'TG PRIMO TEMPO', 'upload_date': '20140612', 'duration': 1758, + 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'Geo-restricted to Italy', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,20 +69,22 @@ class RaiTVIE(InfoExtractor): }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '496ab63e420574447f70d02578333437', + 'md5': 'e57493e1cb8bc7c564663f363b171847', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', 'description': 'md5:364b604f7db50594678f483353164fb8', 'upload_date': '20140923', 'duration': 386, + 'thumbnail': 're:^https?://.*\.jpg$', } }, ] def _real_extract(self, url): video_id = self._match_id(url) + media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, video_id, 'Downloading video JSON') @@ -90,10 +94,9 @@ class RaiTVIE(InfoExtractor): thumbnail_url = media.get(image_type) if thumbnail_url: thumbnails.append({ - 'url': thumbnail_url, + 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = [] formats = [] media_type = media['type'] if 'Audio' in media_type: @@ -103,49 +106,60 @@ class RaiTVIE(InfoExtractor): 'ext': media.get('formatoAudio'), }) elif 'Video' in media_type: - def fix_xml(xml): - return xml.replace(' tag elementi', '').replace('>/', '</') + for platform in ('mon', 'flash', 'native'): + headers = {} + # TODO: rename --cn-verification-proxy + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy - relinker = self._download_xml( - media['mediaUri'] + '&output=43', - video_id, 
transform_source=fix_xml) + relinker = self._download_xml( + media['mediaUri'], video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, headers=headers) - has_subtitle = False + media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if media_url == 'http://download.rai.it/video_no_available.mp4': + self.raise_geo_restricted() - for element in relinker.findall('element'): - media_url = xpath_text(element, 'url') ext = determine_ext(media_url) - content_type = xpath_text(element, 'content-type') + if (platform == 'mon' and ext != 'm3u8') or (platform == 'flash' and ext != 'f4m'): + continue + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif ext == 'f4m': + manifest_url = update_url_query( + media_url, {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) formats.extend(self._extract_f4m_formats( - media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'stl': - has_subtitle = True - elif content_type.startswith('video/'): - bitrate = int_or_none(xpath_text(element, 'bitrate')) + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) formats.append({ 'url': media_url, 'tbr': bitrate if bitrate > 0 else None, 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) - elif content_type.startswith('image/'): - thumbnails.append({ - 'url': media_url, - }) self._sort_formats(formats) - - if has_subtitle: - webpage = self._download_webpage(url, video_id) - subtitles = self._get_subtitles(video_id, webpage) else: raise ExtractorError('not a media file') + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ 
+ 'ext': 'srt', + 'url': captions, + }] + return { 'id': video_id, 'title': media['name'], @@ -158,31 +172,16 @@ class RaiTVIE(InfoExtractor): 'subtitles': subtitles, } - def _get_subtitles(self, video_id, webpage): - subtitles = {} - m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) - if m: - captions = m.group('captions') - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), - }] - return subtitles - class RaiIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 'upload_date': '20141221', From 64436cb1a453d6f92a72ac4e28cde1fa03b00fc7 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 3 Jul 2016 10:43:36 +0100 Subject: [PATCH 19/24] [nationalgeographic] skip download for national geographic channel tests(closes #9991) --- youtube_dl/extractor/nationalgeographic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index ed76798aa..6eabd2278 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -79,6 +79,10 @@ class NationalGeographicChannelIE(ThePlatformIE): 'upload_date': '20160322', 'uploader': 'NEWA-FNG-NGTV', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': 
['ThePlatform'], }, { @@ -93,6 +97,10 @@ class NationalGeographicChannelIE(ThePlatformIE): 'upload_date': '20160330', 'uploader': 'NEWA-FNG-NGTV', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['ThePlatform'], }, ] From 034a884957c4b3d9cb9585f3c0d634d99a3e3389 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 19:22:48 +0800 Subject: [PATCH 20/24] [rai] Support direct relinker URLs (closes #8552) --- youtube_dl/extractor/rai.py | 126 +++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 946741f43..b1d3df1ba 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -15,8 +15,54 @@ from ..utils import ( ) -class RaiTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): + def _extract_relinker_formats(self, relinker_url, video_id): + formats = [] + + for platform in ('mon', 'flash', 'native'): + headers = {} + # TODO: rename --cn-verification-proxy + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, headers=headers) + + media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if media_url == 'http://download.rai.it/video_no_available.mp4': + self.raise_geo_restricted() + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, 
video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + return formats + + +class RaiTVIE(RaiBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -106,44 +152,7 @@ class RaiTVIE(InfoExtractor): 'ext': media.get('formatoAudio'), }) elif 'Video' in media_type: - for platform in ('mon', 'flash', 'native'): - headers = {} - # TODO: rename --cn-verification-proxy - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - headers['Ytdl-request-proxy'] = cn_verification_proxy - - relinker = self._download_xml( - media['mediaUri'], video_id, - note='Downloading XML metadata for platform %s' % platform, - transform_source=fix_xml_ampersands, - query={'output': 45, 'pl': platform}, headers=headers) - - media_url = find_xpath_attr(relinker, './url', 'type', 'content').text - if media_url == 'http://download.rai.it/video_no_available.mp4': - self.raise_geo_restricted() - - ext = determine_ext(media_url) - if (platform == 'mon' and ext != 'm3u8') or (platform == 'flash' and ext != 'f4m'): - continue - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - manifest_url = 
update_url_query( - media_url, {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) - formats.extend(self._extract_f4m_formats( - manifest_url, video_id, f4m_id='hds', fatal=False)) - else: - bitrate = int_or_none(xpath_text(relinker, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - + formats.extend(self._extract_relinker_formats(media['mediaUri'], video_id)) self._sort_formats(formats) else: raise ExtractorError('not a media file') @@ -173,7 +182,7 @@ class RaiTVIE(InfoExtractor): } -class RaiIE(InfoExtractor): +class RaiIE(RaiBaseIE): _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { @@ -186,6 +195,16 @@ class RaiIE(InfoExtractor): 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 'upload_date': '20141221', }, + }, + { + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'skip': 'Geo-restricted to Italy', } ] @@ -200,7 +219,24 @@ class RaiIE(InfoExtractor): iframe_url = self._search_regex( [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'var\s+videoURL\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 
'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } From 477b7a847479ffea131e2d9a4993fa3f53540e85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 19:25:40 +0800 Subject: [PATCH 21/24] [downloader/f4m] Fix for Rai live streams --- youtube_dl/downloader/f4m.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 8f88b0241..80c21d40b 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -196,6 +196,11 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 for _ in range(fragments_count): res.append((segment, next(fragments_counter))) @@ -329,7 +334,11 @@ class F4mFD(FragmentFD): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + # From Adobe F4M 3.0 spec: + # The <baseURL> element SHALL be the base URL for all relative + # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said + # URLs should be relative to the location of the containing document. 
+ boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: From 7507fc98cb7ab0f78317cd6b485f88830953645d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Jul 2016 18:35:28 +0700 Subject: [PATCH 22/24] [README.md] Fix somes typo in coding conventions section --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6218060d6..cd9e0b43f 100644 --- a/README.md +++ b/README.md @@ -960,7 +960,7 @@ In any case, thank you very much for your contributions! This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. -Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hoster out of your control and this layout tend to change. As extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize code dependency on source's layout changes and even to make the code foresee potential future changes and be ready for that. This is important because it will allow extractor not to break on minor layout changes thus keeping old youtube-dl version working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with fix incorporated all the previous version become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say some may never receive an update at all that is possible for non rolling release distros. +Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control, and this layout tends to change.
As an extractor implementer, your task is not only to write code that will extract media links and metadata correctly but also to minimize the code's dependency on the source's layout changes and even to make the code foresee potential future changes and be ready for them. This is important because it will allow the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with the fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non rolling release distros. ### Mandatory and optional metafields @@ -972,7 +972,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**.
That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example From 2b28b892d8f103951992b50b4ed76a4729a748fa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 19:44:58 +0800 Subject: [PATCH 23/24] [rai] Support videos with embedded content item ID (#8551) --- youtube_dl/extractor/rai.py | 121 +++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b1d3df1ba..eb32bbdb6 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -60,6 +60,57 @@ class RaiBaseIE(InfoExtractor): return formats + def _extract_from_content_id(self, content_id, base_url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + }) + + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) + self._sort_formats(formats) + else: + raise ExtractorError('not a media file') + + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ + 'ext': 'srt', + 'url': captions, + }] + + return { + 'id':
content_id, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'formats': formats, + 'subtitles': subtitles, + } + class RaiTVIE(RaiBaseIE): _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' @@ -131,55 +182,7 @@ class RaiTVIE(RaiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, - video_id, 'Downloading video JSON') - - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(url, thumbnail_url), - }) - - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], video_id)) - self._sort_formats(formats) - else: - raise ExtractorError('not a media file') - - subtitles = {} - captions = media.get('subtitlesUrl') - if captions: - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': captions, - }] - - return { - 'id': video_id, - 'title': media['name'], - 'description': media.get('desc'), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'formats': formats, - 'subtitles': subtitles, - } + return self._extract_from_content_id(video_id, url) 
class RaiIE(RaiBaseIE): @@ -197,6 +200,7 @@ class RaiIE(RaiBaseIE): }, }, { + # Direct relinker URL 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', # HDS live stream, MD5 is unstable 'info_dict': { @@ -205,7 +209,18 @@ class RaiIE(RaiBaseIE): 'title': 'EuroNews', }, 'skip': 'Geo-restricted to Italy', - } + }, + { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'md5': '84c1135ce960e8822ae63cec34441d63', + 'info_dict': { + 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 02/07/2016', + 'upload_date': '20160702', + }, + }, ] @classmethod @@ -225,6 +240,12 @@ class RaiIE(RaiBaseIE): iframe_url = compat_urlparse.urljoin(url, iframe_url) return self.url_result(iframe_url) + content_item_id = self._search_regex( + r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + relinker_url = compat_urlparse.urljoin(url, self._search_regex( r'var\s+videoURL\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', webpage, 'relinker URL', group='url')) From 15e4b6b758daadab14ad520c55713a8b35a3db7e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 3 Jul 2016 19:48:50 +0800 Subject: [PATCH 24/24] [rai] Support an alternative form of embedded relinker URL Closes #8551 --- youtube_dl/extractor/rai.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index eb32bbdb6..1f0fcd609 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -221,6 +221,15 @@ class RaiIE(RaiBaseIE): 'upload_date': '20160702', }, }, + { + 'url': 
'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'flv', + 'title': 'La diretta di Rainews24', + }, + }, ] @classmethod @@ -247,7 +256,7 @@ class RaiIE(RaiBaseIE): return self._extract_from_content_id(content_item_id, url) relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'var\s+videoURL\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', webpage, 'relinker URL', group='url')) formats = self._extract_relinker_formats(relinker_url, video_id) self._sort_formats(formats)