From 6ff4469528d642bd678df9b1fa83545a0942e333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:39:27 +0600 Subject: [PATCH 01/50] [crunchyroll] Relax fmt regex --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 184ba6896..4a7664296 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -307,7 +307,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_uploader', fatal=False) available_fmts = [] - for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): attrs = extract_attributes(a) href = attrs.get('href') if href and '/freetrial' in href: From 8312b1a3d1dc07d80d33e31f9b2b6facf13fa744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:43:53 +0600 Subject: [PATCH 02/50] [crunchyroll] Add even more relaxed fmt fallback --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4a7664296..58960b2f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -314,7 +314,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text continue available_fmts.append(fmt) if not available_fmts: - available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break video_encode_ids = [] formats = [] for fmt in available_fmts: From 00a17a9e1234ecc868a15b5759472a0f9215f797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:44:10 +0600 Subject: [PATCH 03/50] [crunchyroll] Sort formats --- youtube_dl/extractor/crunchyroll.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 58960b2f8..90a64303d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -367,6 +367,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) + self._sort_formats(formats) metadata = self._download_xml( 'http://www.crunchyroll.com/xml', video_id, From e9c6cdf4a103d1ebdb6927bdab429c370cbe66b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 22:49:04 +0800 Subject: [PATCH 04/50] [common] Fix format_id construction for HLS --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2763d2ffe..61a5d124c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1142,7 +1142,7 @@ class InfoExtractor(object): # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. - if last_media_name and not live: + if not live: format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), From cef3f3011f9d3a67de3ff064a5185a1a4bcf40e7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 30 Apr 2016 00:17:09 +0800 Subject: [PATCH 05/50] [funimation] Detect blocking and support CloudFlare cookies --- youtube_dl/extractor/funimation.py | 48 ++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 1eb528f31..0ad0d9b6a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse_unquote_plus, +) from ..utils import ( clean_html, determine_ext, @@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:1769f43cd5fc130ace8fd87232207892', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', }, { 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', 'info_dict': { @@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }, { 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', 'info_dict': { @@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor): 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': 're:https?://.*\.(?:jpg|png)', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }] + _LOGIN_URL = 'http://www.funimation.com/login' + + def _download_webpage(self, *args, **kwargs): + try: + return super(FunimationIE, self)._download_webpage(*args, **kwargs) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + response = ee.cause.read() + if b'>Please complete the security check to access<' in response: + raise ExtractorError( + 'Access to funimation.com is blocked by CloudFlare. ' + 'Please browse to http://www.funimation.com/, solve ' + 'the reCAPTCHA, export browser cookies to a text file,' + ' and then try again with --cookies YOUR_COOKIE_FILE.', + expected=True) + raise + + def _extract_cloudflare_session_ua(self, url): + ci_session_cookie = self._get_cookies(url).get('ci_session') + if ci_session_cookie: + ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) + # ci_session is a string serialized by PHP function serialize() + # This case is simple enough to use regular expressions only + return self._search_regex( + r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', + default=None) + def _login(self): (username, password) = self._get_login_info() if username is None: @@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor): 'email_field': username, 'password_field': password, }) - login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', + user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) + if not user_agent: + user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' + login_request = sanitized_Request(self._LOGIN_URL, data, headers={ + 'User-Agent': user_agent, 'Content-Type': 'application/x-www-form-urlencoded' }) login_page = self._download_webpage( @@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor): ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), ) + user_agent = self._extract_cloudflare_session_ua(url) + if user_agent: + USER_AGENTS = ((None, user_agent),) + for kind, user_agent in USER_AGENTS: request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage( - request, display_id, 'Downloading %s webpage' % kind) + request, display_id, + 'Downloading %s webpage' % kind if kind else 'Downloading webpage') playlist = self._parse_json( self._search_regex( From 65a3bfb379c9d5e53cac874af097d2071ee4ac4d Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 19:21:17 +0100 Subject: [PATCH 06/50] [dfb] extract m3u8 formats --- youtube_dl/extractor/dfb.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cdfeccacb..a4d0448c2 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -12,39 +12,46 @@ class DFBIE(InfoExtractor): _TEST = { 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', - # The md5 is different each time + 'md5': 'ac0f98a52a330f700b4b3034ad240649', 'info_dict': { 'id': '11633', 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', 'upload_date': '20150714', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, display_id) video_info = player_info.find('video') + stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) - f4m_info = self._download_xml( - self._proto_relative_url(video_info.find('url').text.strip()), display_id) - token_el = f4m_info.find('token') - manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' - formats = self._extract_f4m_formats(manifest_url, display_id) + formats = [] + # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats + for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): + stream_access_info = self._download_xml(sa_url, display_id) + token_el = stream_access_info.find('token') + manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + if '.f4m' in manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.2.0', + display_id, f4m_id='hds', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + manifest_url, display_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, 'upload_date': unified_strdate(video_info.find('time_date').text), 'formats': formats, } From 5556047465e0601d2bdee0e5a436cee64b745851 Mon Sep 17 00:00:00 2001 From: Reino17 Date: Wed, 27 Apr 2016 13:11:38 +0200 Subject: [PATCH 07/50] [rtlnl] Update 720p PG_URL_TEMPLATE - Fixed the format_id for the 720p progressive videostream and added the video's resolution. - The adaptive videostreams have the m3u8-extension, so I removed the confusing mp4-extension in order to make a better distinction between the these and the progressive videostreams. --- youtube_dl/extractor/rtlnl.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 543d94417..e8b55ea25 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,19 +94,30 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') + formats = self._extract_m3u8_formats(m3u8_url, uuid) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ { - 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart), - 'format_id': 'pg-sd', + 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), + 'format_id': 'a2t', + 'width': 512, + 'height': 288, }, { - 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), - 'format_id': 'pg-hd', + 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), + 'format_id': 'a3t', + 'width': 704, + 'height': 400, + 'quality': 0, + }, + { + 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), + 'format_id': 'nettv', + 'width': 1280, + 'height': 720, 'quality': 0, } ]) From 0571ffda7dd12fc1067c0344f3ce4ce47b39edb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:43:39 +0600 Subject: [PATCH 08/50] [rtlnl] Improve extraction (Closes #9329) * Make hls extraction non fatal and revert ext * Extract progressive formats' metadata from corresponding hls formats --- youtube_dl/extractor/rtlnl.py | 55 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e8b55ea25..c95bcf035 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,33 +94,44 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid) + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - formats.extend([ - { - 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), - 'format_id': 'a2t', - 'width': 512, - 'height': 288, - }, - { - 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), - 'format_id': 'a3t', - 'width': 704, - 'height': 400, - 'quality': 0, - }, - { - 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), - 'format_id': 'nettv', - 'width': 1280, - 'height': 720, - 'quality': 0, + PG_FORMATS = ( + ('a2t', 512, 288), + ('a3t', 704, 400), + ('nettv', 1280, 720), + ) + + def pg_format(format_id, width, height): + return { + 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), + 'format_id': 'pg-%s' % format_id, + 'protocol': 'http', + 'width': width, + 'height': height, } - ]) + + if not formats: + formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] + else: + pg_formats = [] + for format_id, width, height in PG_FORMATS: + try: + # Find hls format with the same width and height corresponding + # to progressive format and copy metadata from it. + f = next(f for f in formats + if f.get('width') == width and f.get('height') == height).copy() + f.update(pg_format(format_id, width, height)) + pg_formats.append(f) + except StopIteration: + # Missing hls format does mean that no progressive format with + # such width and height exists either. + pass + formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] From cd63d091cecd8a85a2080035051205b00f3454d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:48:14 +0600 Subject: [PATCH 09/50] [rtlnl] Fix tests --- youtube_dl/extractor/rtlnl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index c95bcf035..e4411054a 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -39,7 +39,7 @@ class RtlNlIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } @@ -50,7 +50,7 @@ class RtlNlIE(InfoExtractor): 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, From 373e1230e4a3b934ddc59c212773d36a7e998dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:50:26 +0600 Subject: [PATCH 10/50] [rtlnl] Clarify tests --- youtube_dl/extractor/rtlnl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e4411054a..5e916c4ab 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -32,6 +32,7 @@ class RtlNlIE(InfoExtractor): 'duration': 576.880, }, }, { + # best format avaialble a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { @@ -45,6 +46,7 @@ class RtlNlIE(InfoExtractor): } }, { # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + # best format available nettv 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', From ca278a182b9331201e058f9f4d46b3b6114a1518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 02:07:29 +0600 Subject: [PATCH 11/50] [rtlnl] Replace test --- youtube_dl/extractor/rtlnl.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 5e916c4ab..8598b5840 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -20,16 +20,16 @@ class RtlNlIE(InfoExtractor): (?P[0-9a-f-]+)''' _TESTS = [{ - 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', - 'md5': 'cc16baa36a6c169391f0764fa6b16654', + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { - 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'ext': 'mp4', - 'title': 'RTL Nieuws - Laat', - 'description': 'md5:6b61f66510c8889923b11f2778c72dc5', - 'timestamp': 1408051800, - 'upload_date': '20140814', - 'duration': 576.880, + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, }, }, { # best format avaialble a3t From 69c4cde4ba6a4c7dfb8a46d1713cbb46d6f1d623 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 21:35:09 +0100 Subject: [PATCH 12/50] [wsj] improve extraction --- youtube_dl/extractor/wsj.py | 95 +++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 5a897371d..a83e68b17 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -4,16 +4,22 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + float_or_none, unified_strdate, ) class WSJIE(InfoExtractor): - _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P[a-zA-Z0-9-]+)' + _VALID_URL = r'''(?x)https?:// + (?: + video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| + (?:www\.)?wsj\.com/video/[^/]+/ + ) + (?P[a-zA-Z0-9-]+)''' IE_DESC = 'Wall Street Journal' - _TEST = { + _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', - 'md5': '9747d7a6ebc2f4df64b981e1dde9efa9', + 'md5': 'e230a5bb249075e40793b655a54a02e4', 'info_dict': { 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'ext': 'mp4', @@ -24,65 +30,60 @@ class WSJIE(InfoExtractor): 'duration': 90, 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', }, - } + }, { + 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - bitrates = [128, 174, 264, 320, 464, 664, 1264] api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' - 'type=guid&count=1&query=%s&' - 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' - 'author,description,name,linkURL,videoStillURL,duration,videoURL,' - 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,' - 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,' - 'allthingsd-subsection,sm-section,sm-subsection,provider,' - 'formattedCreationDate,keywords,keywordsOmniture,column,editor,' - 'emailURL,emailPartnerID,showName,omnitureProgramName,' - 'omnitureVideoFormat,linkRelativeURL,touchCastID,' - 'omniturePublishDate,%s') % ( - video_id, ','.join('video%dkMP4Url' % br for br in bitrates)) + 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' + 'thumbnailList,author,description,name,duration,videoURL,' + 'titletag,formattedCreationDate,keywords,editor' % video_id) info = self._download_json(api_url, video_id)['items'][0] - - # Thumbnails are conveniently in the correct format already - thumbnails = info.get('thumbnailList') - creator = info.get('author') - uploader_id = info.get('editor') - categories = info.get('keywords') - duration = int_or_none(info.get('duration')) - upload_date = unified_strdate( - info.get('formattedCreationDate'), day_first=False) title = info.get('name', info.get('titletag')) - formats = [{ - 'format_id': 'f4m', - 'format_note': 'f4m (meta URL)', - 'url': info['videoURL'], - }] - if info.get('hls'): + formats = [] + + f4m_url = info.get('videoURL') + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + m3u8_url = info.get('hls') + if m3u8_url: formats.extend(self._extract_m3u8_formats( info['hls'], video_id, ext='mp4', - preference=0, entry_protocol='m3u8_native')) - for br in bitrates: - field = 'video%dkMP4Url' % br - if info.get(field): - formats.append({ - 'format_id': 'mp4-%d' % br, - 'container': 'mp4', - 'tbr': br, - 'url': info[field], - }) + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + for v in info.get('videoMP4List', []): + mp4_url = v.get('url') + if not mp4_url: + continue + tbr = int_or_none(v.get('bitrate')) + formats.append({ + 'url': mp4_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + 'width': int_or_none(v.get('width')), + 'height': int_or_none(v.get('height')), + 'fps': float_or_none(v.get('fps')), + }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, - 'thumbnails': thumbnails, - 'creator': creator, - 'uploader_id': uploader_id, - 'duration': duration, - 'upload_date': upload_date, + # Thumbnails are conveniently in the correct format already + 'thumbnails': info.get('thumbnailList'), + 'creator': info.get('author'), + 'uploader_id': info.get('editor'), + 'duration': int_or_none(info.get('duration')), + 'upload_date': unified_strdate(info.get( + 'formattedCreationDate'), day_first=False), 'title': title, - 'categories': categories, + 'categories': info.get('keywords'), } From cbc032c8b70a038a69259378c92b4ba97b42d491 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 01:24:36 +0100 Subject: [PATCH 13/50] [pbs] extract all http formats --- youtube_dl/extractor/pbs.py | 48 ++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f43e3a146..38cdb9975 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -196,7 +196,7 @@ class PBSIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'md5': '173dc391afd361fa72eab5d3d918968d', 'info_dict': { 'id': '2365006249', 'ext': 'mp4', @@ -204,13 +204,10 @@ class PBSIE(InfoExtractor): 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', 'duration': 3190, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '143c98aa54a346738a3d78f54c925321', + 'md5': '6f722cb3c3982186d34b0f13374499c7', 'info_dict': { 'id': '2365297690', 'ext': 'mp4', @@ -218,9 +215,6 @@ class PBSIE(InfoExtractor): 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', 'duration': 5050, }, - 'params': { - 'skip_download': True, # requires ffmpeg - } }, { 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -244,9 +238,6 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -262,9 +253,6 @@ class PBSIE(InfoExtractor): 'upload_date': '20140122', 'age_limit': 10, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -290,6 +278,7 @@ class PBSIE(InfoExtractor): }, { 'url': 'http://www.pbs.org/video/2365245528/', + 'md5': '115223d41bd55cda8ae5cd5ed4e11497', 'info_dict': { 'id': '2365245528', 'display_id': '2365245528', @@ -299,15 +288,13 @@ class PBSIE(InfoExtractor): 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "