From 42b7a5afe09e485503cbe9794c7ad18c46dc838d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 3 Aug 2016 13:11:48 +0100 Subject: [PATCH 01/87] [limelight] extract http formats --- youtube_dl/extractor/limelight.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index efe1437e0..a425bafe3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -53,11 +53,17 @@ class LimelightBaseIE(InfoExtractor): 'height': int_or_none(stream.get('videoHeightInPixels')), 'ext': ext, } - rtmp = re.search(r'^(?Prtmpe?://[^/]+/(?P.+))/(?Pmp4:.+)$', stream_url) + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp4:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) + http_fmt = fmt.copy() + http_fmt.update({ + 'url': 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]), + 'format_id': format_id.replace('rtmp', 'http'), + }) + formats.append(http_fmt) fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), @@ -166,9 +172,10 @@ class LimelightMediaIE(LimelightBaseIE): }, { # video with subtitles 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', + 'md5': '2fa3bad9ac321e23860ca23bc2c69e3d', 'info_dict': { 'id': 'a3e00274d4564ec4a9b29b9466432335', - 'ext': 'flv', + 'ext': 'mp4', 'title': '3Play Media Overview Video', 'thumbnail': 're:^https?://.*\.jpeg$', 'duration': 78.101, @@ -176,10 +183,6 @@ class LimelightMediaIE(LimelightBaseIE): 'upload_date': '20120605', 'subtitles': 'mincount:9', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, From 8d3b226b8371c7fd58e052852acc01d6e97872b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 
3 Aug 2016 22:06:10 +0700 Subject: [PATCH 02/87] [gamekings] Remove extractor Now covered by generic jwplayer --- youtube_dl/extractor/extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 86c48ff54..8b866ed57 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -290,7 +290,6 @@ from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .gameinformer import GameInformerIE -from .gamekings import GamekingsIE from .gameone import ( GameOneIE, GameOnePlaylistIE, From 6bb0fbf9fb60047087b44c6e147ed071fe0dc41f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 4 Aug 2016 09:54:28 +0800 Subject: [PATCH 03/87] Revert "[README.md] Use full paths for all configuration files (#8863)" This reverts commit 899d2bea63f34e85967a93da3c9c2835c3e9bfd4. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 640625212..a9f3001a6 100644 --- a/README.md +++ b/README.md @@ -424,7 +424,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config/youtube-dl.conf`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. 
On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` From 8b40854529eea6b32f41df448c475b87a42038d1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 09:24:20 +0100 Subject: [PATCH 04/87] [common] lower proto_preference of rtsp formats Most of the time the RtspFD fail to download videos but it report success of the download with this output: [mpv] 0 bytes [download] 100% of 0.00B --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3b6a5491d..2d337d614 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -916,7 +916,8 @@ class InfoExtractor(object): if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 - proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + protocol = f.get('protocol') or determine_protocol(f) + proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) if f.get('vcodec') == 'none': # audio only preference -= 50 From 217d5ae0137943829db23d13eee425e5fd7c08ae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 09:37:27 +0100 Subject: [PATCH 05/87] [vodplatform] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 8 ++++ youtube_dl/extractor/vodplatform.py | 58 +++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 youtube_dl/extractor/vodplatform.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b866ed57..909f7d25c 100644 
--- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1004,6 +1004,7 @@ from .vk import ( ) from .vlive import VLiveIE from .vodlocker import VodlockerIE +from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE from .vporn import VpornIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5364f0b19..e89a03760 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2207,6 +2207,14 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + # Look for VODPlatform embeds + mobj = re.search( + r']+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform') + # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py new file mode 100644 index 000000000..b49542b16 --- /dev/null +++ b/youtube_dl/extractor/vodplatform.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class VODPlatformIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/embed/(?P[^/?#]+)' + _TEST = { + # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar + 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', + 'md5': '1db2b7249ce383d6be96499006e951fc', + 'info_dict': { + 'id': 'RufMcytHDolTH1MuKHY9Fw', + 'ext': 'mp4', + 'title': 'LBCi News_ النصرة في ضيافة الـ "سي.أن.أن"', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + title = unescapeHTML(self._og_search_title(webpage)) + hidden_inputs = self._hidden_inputs(webpage) + + base_url = self._search_regex( + '(.*/)(?:playlist.m3u8|manifest.mpd)', + hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], + 'base url') + formats = self._extract_m3u8_formats( + base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + base_url + 'manifest.mpd', video_id, + mpd_id='dash', fatal=False)) + rtmp_formats = self._extract_smil_formats( + base_url + 'jwplayer.smil', video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + } From 1094074c045140e9a91b521b0a933f394a7bba91 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 09:38:37 +0100 Subject: [PATCH 06/87] [kaltura] extract subtitles and reduce requests --- youtube_dl/extractor/kaltura.py | 43 +++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 1729f5bfb..9930ea710 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -62,6 +62,11 @@ class KalturaIE(InfoExtractor): { 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', 'only_matching': True, 
+ }, + { + # video with subtitles + 'url': 'kaltura:111032:1_cw786r8q', + 'only_matching': True, } ] @@ -130,7 +135,6 @@ class KalturaIE(InfoExtractor): video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] def _get_video_info(self, video_id, partner_id, service_url=None): - signature = self._get_kaltura_signature(video_id, partner_id, service_url) actions = [ { 'action': 'null', @@ -138,18 +142,30 @@ class KalturaIE(InfoExtractor): 'clientTag': 'kdp:v3.8.5', 'format': 1, # JSON, 2 = XML, 3 = PHP 'service': 'multirequest', - 'ks': signature, + }, + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': '_%s' % partner_id, }, { 'action': 'get', 'entryId': video_id, 'service': 'baseentry', - 'version': '-1', + 'ks': '{1:result:ks}', }, { 'action': 'getbyentryid', 'entryId': video_id, 'service': 'flavorAsset', + 'ks': '{1:result:ks}', + }, + { + 'action': 'list', + 'filter:entryIdEqual': video_id, + 'service': 'caption_captionasset', + 'ks': '{1:result:ks}', }, ] return self._kaltura_api_call( @@ -161,8 +177,9 @@ class KalturaIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) partner_id, entry_id = mobj.group('partner_id', 'id') ks = None + captions = None if partner_id and entry_id: - info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -181,7 +198,7 @@ class KalturaIE(InfoExtractor): raise ExtractorError('Invalid URL', expected=True) if 'entry_id' in params: entry_id = params['entry_id'][0] - info, flavor_assets = self._get_video_info(entry_id, partner_id) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id) elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: reference_id = params['flashvars[referenceId]'][0] webpage = 
self._download_webpage(url, reference_id) @@ -217,7 +234,7 @@ class KalturaIE(InfoExtractor): formats = [] for f in flavor_assets: # Continue if asset is not ready - if f['status'] != 2: + if f.get('status') != 2: continue video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) @@ -240,13 +257,25 @@ class KalturaIE(InfoExtractor): m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._check_formats(formats, entry_id) self._sort_formats(formats) + subtitles = {} + if captions: + for caption in captions.get('objects', []): + print(caption) + # Continue if caption is not ready + if f.get('status') != 2: + continue + subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ + 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), + 'ext': caption.get('fileExt'), + }) + return { 'id': entry_id, 'title': info['name'], 'formats': formats, + 'subtitles': subtitles, 'description': clean_html(info.get('description')), 'thumbnail': info.get('thumbnailUrl'), 'duration': info.get('duration'), From 1891ea2d760a49d356a472516db40bad8309ef3c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 12:18:10 +0100 Subject: [PATCH 07/87] [nationalgeographic] Add support for National Geographic Episode Guide --- youtube_dl/extractor/nationalgeographic.py | 54 ++++++++++++++++++++-- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index e717abb9f..fe43d4bc8 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,16 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, url_basename, update_url_query, + get_element_by_class, ) -class NationalGeographicIE(InfoExtractor): - IE_NAME = 'natgeo' 
+class NationalGeographicVideoIE(InfoExtractor): + IE_NAME = 'natgeo:video' _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' _TESTS = [ @@ -62,9 +65,9 @@ class NationalGeographicIE(InfoExtractor): } -class NationalGeographicChannelIE(ThePlatformIE): - IE_NAME = 'natgeo:channel' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P[^/?]+)' +class NationalGeographicIE(ThePlatformIE): + IE_NAME = 'natgeo' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' _TESTS = [ { @@ -95,6 +98,10 @@ class NationalGeographicChannelIE(ThePlatformIE): }, 'add_ie': ['ThePlatform'], }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', + 'only_matching': True, + } ] def _real_extract(self, url): @@ -122,3 +129,40 @@ class NationalGeographicChannelIE(ThePlatformIE): {'force_smil_url': True}), 'display_id': display_id, } + + +class NationalGeographicEpisodeGuideIE(ThePlatformIE): + IE_NAME = 'natgeo:episodeguide' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P[^/]+)/episode-guide' + _TESTS = [ + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', + 'info_dict': { + 'id': 'the-story-of-god-with-morgan-freeman-season-1', + 'title': 'The Story of God with Morgan Freeman - Season 1', + }, + 'playlist_mincount': 6, + }, + { + 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', + 'info_dict': { + 'id': 'underworld-inc-season-2', + 'title': 'Underworld, Inc. 
- Season 2', + }, + 'playlist_mincount': 7, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show = get_element_by_class('show', webpage) + selected_season = self._search_regex( + r']+class="select-seasons[^"]*".*?]*>(.*?)', + webpage, 'selected season') + entries = [ + self.url_result(self._proto_relative_url(url), 'NationalGeographic') + for url in re.findall('(?s)]+class="col-inner"[^>]*?>.*?]+href="([^"]+)"', webpage)] + return self.playlist_result( + entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), + '%s - %s' % (show, selected_season)) From 3c2c3af059d99e6795d1944a55d6601b5959871b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 12:20:56 +0100 Subject: [PATCH 08/87] [extractors] change imports for national geographic extractors --- youtube_dl/extractor/extractors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 909f7d25c..28858cfea 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -491,8 +491,9 @@ from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import ( + NationalGeographicVideoIE, NationalGeographicIE, - NationalGeographicChannelIE, + NationalGeographicEpisodeGuideIE, ) from .naver import NaverIE from .nba import NBAIE From 14704aeff6eeee4357e3a26f83432ff908db64fc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 14:54:34 +0100 Subject: [PATCH 09/87] [kaltura] remove debugging line --- youtube_dl/extractor/kaltura.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 9930ea710..ddf1165ff 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -262,7 +262,6 @@ class KalturaIE(InfoExtractor): subtitles = {} if 
captions: for caption in captions.get('objects', []): - print(caption) # Continue if caption is not ready if f.get('status') != 2: continue From 2396062c747ee81420dac0eac914d531ec8df910 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 16:21:27 +0100 Subject: [PATCH 10/87] [5min] delegate extraction to AolIE recently the 5min SenseHandler request return HTTP Error 503: Service Unavailable error --- youtube_dl/extractor/fivemin.py | 135 +++++--------------------------- 1 file changed, 19 insertions(+), 116 deletions(-) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 6b8345416..2f882cbcb 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,24 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - parse_duration, - replace_extension, -) class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'(?:5min:(?P\d+)(?::(?P\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P.*))' + _VALID_URL = r'(?:5min:|(?:https?://(?:[^/]*?5min\.com|delivery\.vidible\.tv/aol)/(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P\d+)' _TESTS = [ { @@ -29,8 +16,16 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'description': 'iPad mini with Retina Display review', 'duration': 177, + 'uploader': 'engadget', + 'upload_date': '20131115', + 'timestamp': 1384515288, }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, { # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 @@ -44,108 +39,16 @@ class FiveMinIE(InfoExtractor): }, 'skip': 'no longer available', }, + { + 'url': 'http://embed.5min.com/518726732/', + 'only_matching': True, + }, + { + 'url': 
'http://delivery.vidible.tv/aol?playList=518013791', + 'only_matching': True, + } ] - _ERRORS = { - 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', - 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', - 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', - 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', - 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - } - _QUALITIES = { - 1: { - 'width': 640, - 'height': 360, - }, - 2: { - 'width': 854, - 'height': 480, - }, - 4: { - 'width': 1280, - 'height': 720, - }, - 8: { - 'width': 1920, - 'height': 1080, - }, - 16: { - 'width': 640, - 'height': 360, - }, - 32: { - 'width': 854, - 'height': 480, - }, - 64: { - 'width': 1280, - 'height': 720, - }, - 128: { - 'width': 640, - 'height': 360, - }, - } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sid = mobj.group('sid') - - if mobj.group('query'): - qs = compat_parse_qs(mobj.group('query')) - if not qs.get('playList'): - raise ExtractorError('Invalid URL', expected=True) - video_id = qs['playList'][0] - if qs.get('sid'): - sid = qs['sid'][0] - - embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - if not sid: - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') - - response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ - compat_urllib_parse_urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }), - video_id) - if not response['success']: - raise ExtractorError( - '%s said: %s' % ( - self.IE_NAME, - self._ERRORS.get(response['errorMessage'], response['errorMessage'])), - expected=True) - info = response['binding'][0] - - formats = [] - parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( - compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) - for rendition in info['Renditions']: - if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': - continue - else: - rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) - quality = self._QUALITIES.get(rendition['ID'], {}) - formats.append({ - 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), - 'url': rendition_url, - 'width': quality.get('width'), - 'height': quality.get('height'), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info['Title'], - 'thumbnail': info.get('ThumbURL'), - 'duration': parse_duration(info.get('Duration')), - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('aol-video:%s' % video_id) From 52e7fcfeb794ca0e50e312357d60234bcca5118c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 16:34:47 +0100 Subject: [PATCH 11/87] [engadget] Relax _VALID_URL --- youtube_dl/extractor/engadget.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e5e57d485..a39e9010d 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -4,9 +4,10 @@ from .common import InfoExtractor class EngadgetIE(InfoExtractor): - _VALID_URL = 
r'https?://www.engadget.com/video/(?P\d+)' + _VALID_URL = r'https?://www.engadget.com/video/(?P[^/?#]+)' - _TEST = { + _TESTS = [{ + # video with 5min ID 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { @@ -15,8 +16,12 @@ class EngadgetIE(InfoExtractor): 'title': 'Samsung Galaxy Tab Pro 8.4 Review', }, 'add_ie': ['FiveMin'], - } + }, { + # video with vidible ID + 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + return self.url_result('aol-video:%s' % video_id) From 8895be01fc2d3774aa0b9b000af20c22903d3391 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 16:55:12 +0100 Subject: [PATCH 12/87] [5min] fix _VALID_URL --- youtube_dl/extractor/fivemin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2f882cbcb..ab82701c2 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'(?:5min:|(?:https?://(?:[^/]*?5min\.com|delivery\.vidible\.tv/aol)/(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P\d+)' + _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com|delivery\.vidible\.tv/aol)/(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P\d+)' _TESTS = [ { From 8a00ea567b88ea240c6fe672387cc5cc482495fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Aug 2016 23:21:04 +0700 Subject: [PATCH 13/87] [natgeo:episodeguide] Do not shadow url from outer scope --- youtube_dl/extractor/nationalgeographic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 
fe43d4bc8..4bffcf704 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -161,8 +161,8 @@ class NationalGeographicEpisodeGuideIE(ThePlatformIE): r']+class="select-seasons[^"]*".*?]*>(.*?)', webpage, 'selected season') entries = [ - self.url_result(self._proto_relative_url(url), 'NationalGeographic') - for url in re.findall('(?s)]+class="col-inner"[^>]*?>.*?]+href="([^"]+)"', webpage)] + self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') + for entry_url in re.findall('(?s)]+class="col-inner"[^>]*?>.*?]+href="([^"]+)"', webpage)] return self.playlist_result( entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), '%s - %s' % (show, selected_season)) From 4f427c4be860c582ca72dd4be64d45b54499232c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 18:28:49 +0100 Subject: [PATCH 14/87] [condenast] improve extraction --- youtube_dl/extractor/condenast.py | 111 ++++++++++++++++++------------ 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index e8f2b5a07..976a0e89d 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,13 +5,17 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( orderedSet, remove_end, + extract_attributes, + mimetype2ext, + determine_ext, + int_or_none, + parse_iso8601, ) @@ -58,6 +62,9 @@ class CondeNastIE(InfoExtractor): 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. 
You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + 'uploader': 'wired', + 'upload_date': '20130314', + 'timestamp': 1363219200, } }, { # JS embed @@ -67,70 +74,84 @@ class CondeNastIE(InfoExtractor): 'id': '55f9cf8b61646d1acf00000c', 'ext': 'mp4', 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'uploader': 'arstechnica', + 'upload_date': '20150916', + 'timestamp': 1442434955, } }] def _extract_series(self, url, webpage): - title = self._html_search_regex(r'
.*?

(.+?)

', - webpage, 'series title', flags=re.DOTALL) + title = self._html_search_regex( + r'(?s)
.*?

(.+?)

', + webpage, 'series title') url_object = compat_urllib_parse_urlparse(url) base_url = '%s://%s' % (url_object.scheme, url_object.netloc) - m_paths = re.finditer(r'

.*?.*?(.+?)

', - r'
(.+?)
', - ], - webpage, 'description', fatal=False, flags=re.DOTALL) + query = {} + params = self._search_regex( + r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) + if params: + query.update({ + 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), + 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), + 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), + }) else: - description = None - params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, - 'player params', flags=re.DOTALL) - video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') - player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') - target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse_urlencode({'videoId': video_id, - 'playerId': player_id, - 'target': target, - }) - base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', - webpage, 'base info url', - default='http://player.cnevids.com/player/loader.js?') - info_url = base_info_url + data - info_page = self._download_webpage(info_url, video_id, - 'Downloading video info') - video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') - video_info = self._parse_json(video_info, video_id) + params = extract_attributes(self._search_regex( + r'(<[^>]+data-js="video-player"[^>]+>)', + webpage, 'player params element')) + query.update({ + 'videoId': params['data-video'], + 'playerId': params['data-player'], + 'target': params['id'], + }) + video_id = query['videoId'] + info_page = self._download_webpage( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', query=query) + video_info = self._parse_json(self._search_regex( + r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] + title = video_info['title'] - formats = [{ - 'format_id': '%s-%s' % 
(fdata['type'].split('/')[-1], fdata['quality']), - 'url': fdata['src'], - 'ext': fdata['type'].split('/')[-1], - 'quality': 1 if fdata['quality'] == 'high' else 0, - } for fdata in video_info['sources'][0]] + formats = [] + for fdata in video_info.get('sources', [{}])[0]: + src = fdata.get('src') + if not src: + continue + ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + quality = fdata.get('quality') + formats.append({ + 'format_id': ext + ('-%s' % quality if quality else ''), + 'url': src, + 'ext': ext, + 'quality': 1 if quality == 'high' else 0, + }) self._sort_formats(formats) - return { + info = self._search_json_ld(webpage, video_id) if url_type != 'embed' else {} + info.update({ 'id': video_id, 'formats': formats, - 'title': video_info['title'], - 'thumbnail': video_info['poster_frame'], - 'description': description, - } + 'title': title, + 'thumbnail': video_info.get('poster_frame'), + 'uploader': video_info.get('brand'), + 'duration': int_or_none(video_info.get('duration')), + 'tags': video_info.get('tags'), + 'series': video_info.get('series_title'), + 'season': video_info.get('season_title'), + 'timestamp': parse_iso8601(video_info.get('premiere_date')), + }) + return info def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') - url_type = mobj.group('type') - item_id = mobj.group('id') + site, url_type, item_id = re.match(self._VALID_URL, url).groups() # Convert JS embed to regular embed if url_type == 'embedjs': From b02b960c6bba834d9e7199ac53430c7933079dc8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Aug 2016 21:42:22 +0100 Subject: [PATCH 15/87] [naver] improve extraction(closes #8096) --- youtube_dl/extractor/naver.py | 95 ++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 6d6f69b44..60a4872be 100644 --- a/youtube_dl/extractor/naver.py +++ 
b/youtube_dl/extractor/naver.py @@ -4,12 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import ( ExtractorError, + int_or_none, + update_url_query, ) @@ -23,7 +21,6 @@ class NaverIE(InfoExtractor): 'ext': 'mp4', 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - 'upload_date': '20130903', }, }, { 'url': 'http://tvcast.naver.com/v/395837', @@ -33,7 +30,6 @@ class NaverIE(InfoExtractor): 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', - 'upload_date': '20150519', }, 'skip': 'Georestricted', }] @@ -51,48 +47,65 @@ class NaverIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') - vid = m_id.group(1) - key = m_id.group(2) - query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, }) - query_urls = compat_urllib_parse_urlencode({ - 'masterVid': vid, - 'protocol': 'p2p', - 'inKey': key, + video_data = self._download_json('http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), video_id, query={ + 'key': m_id.group(2), }) - info = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, - video_id, 'Downloading video info') - urls = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' 
+ query_urls, - video_id, 'Downloading video formats info') - + meta = video_data['meta'] + title = meta['subject'] formats = [] - for format_el in urls.findall('EncodingOptions/EncodingOption'): - domain = format_el.find('Domain').text - uri = format_el.find('uri').text - f = { - 'url': compat_urlparse.urljoin(domain, uri), - 'ext': 'mp4', - 'width': int(format_el.find('width').text), - 'height': int(format_el.find('height').text), - } - if domain.startswith('rtmp'): - # urlparse does not support custom schemes - # https://bugs.python.org/issue18828 - f.update({ - 'url': domain + uri, - 'ext': 'flv', - 'rtmp_protocol': '1', # rtmpt + + def extract_formats(streams, stream_type, query={}): + for stream in streams: + stream_url = stream.get('source') + if not stream_url: + continue + stream_url = update_url_query(stream_url, query) + encoding_option = stream.get('encodingOption', {}) + bitrate = stream.get('bitrate', {}) + formats.append({ + 'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')), + 'url': stream_url, + 'width': int_or_none(encoding_option.get('width')), + 'height': int_or_none(encoding_option.get('height')), + 'vbr': int_or_none(bitrate.get('video')), + 'abr': int_or_none(bitrate.get('audio')), + 'filesize': int_or_none(stream.get('size')), + 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, }) - formats.append(f) + + extract_formats(video_data.get('videos', {}).get('list', []), 'H264') + for stream_set in video_data.get('streams', []): + query = {} + for param in stream_set.get('keys', []): + query[param['name']] = param['value'] + stream_type = stream_set.get('type') + videos = stream_set.get('videos') + if videos: + extract_formats(videos, stream_type, query) + elif stream_type == 'HLS': + stream_url = stream_set.get('source') + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + update_url_query(stream_url, query), video_id, + 'mp4', 'm3u8_native', 
m3u8_id=stream_type, fatal=False)) self._sort_formats(formats) + subtitles = {} + for caption in video_data.get('captions', {}).get('list', []): + caption_url = caption.get('source') + if not caption_url: + continue + subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({ + 'url': caption_url, + }) + return { 'id': video_id, - 'title': info.find('Subject').text, + 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': info.find('WriteDate').text.replace('.', ''), - 'view_count': int(info.find('PlayCount').text), + 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), + 'view_count': int_or_none(meta.get('count')), } From 7dc2a74e0ac9cfa74cc9de6f586ffd5cc8bac0d9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 5 Aug 2016 11:41:55 +0800 Subject: [PATCH 16/87] [utils] Fix unified_timestamp for formats parsed by parsedate_tz() --- test/test_utils.py | 1 + youtube_dl/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 2273b5a10..5a2ae4a1e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -308,6 +308,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('25-09-2014'), 1411603200) self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) + self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f5cd6819b..97ddd9883 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1101,7 +1101,7 @@ def unified_timestamp(date_str, day_first=True): date_str = date_str.replace(',', ' ') - pm_delta = 
datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0) + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) # Remove AM/PM + timezone @@ -1109,13 +1109,13 @@ def unified_timestamp(date_str, day_first=True): for expression in date_formats(day_first): try: - dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) return calendar.timegm(dt.timetuple()) except ValueError: pass timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple.timetuple()) + return calendar.timegm(timetuple) + pm_delta * 3600 def determine_ext(url, default_ext='unknown_video'): From 962250f7eaf648e998e66e1d6aa5c2b4e018cd27 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 5 Aug 2016 11:44:50 +0800 Subject: [PATCH 17/87] [cbslocal] Fix timestamp parsing (closes #10213) --- ChangeLog | 7 +++++++ youtube_dl/extractor/cbslocal.py | 9 ++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index f3c752e66..2d021ddf4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Fixed/improved extractors +- [cbslocal] Fix timestamp parsing (#10213) + + version 2016.08.01 Fixed/improved extractors @@ -7,6 +13,7 @@ Fixed/improved extractors - [safari] Relax regular expressions for URL matching (#10202) - [cwtv] Add support for cwtvpr.com (#10196) + version 2016.07.30 Fixed/improved extractors diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 74adb38a6..008c5fe32 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -1,12 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse +from ..utils import unified_timestamp 
class CBSLocalIE(AnvatoIE): @@ -71,10 +69,7 @@ class CBSLocalIE(AnvatoIE): time_str = self._html_search_regex( r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) - timestamp = None - if time_str: - timestamp = calendar.timegm(datetime.datetime.strptime( - time_str, '%b %d, %Y %I:%M %p').timetuple()) + timestamp = unified_timestamp(time_str) info_dict.update({ 'display_id': display_id, From f65dc41b72d3449d9faf1a2fcd0a2e32befa3607 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 08:11:44 +0100 Subject: [PATCH 18/87] [naver] extract upload date --- youtube_dl/extractor/naver.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 60a4872be..0891d2772 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -21,6 +21,7 @@ class NaverIE(InfoExtractor): 'ext': 'mp4', 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + 'upload_date': '20130903', }, }, { 'url': 'http://tvcast.naver.com/v/395837', @@ -30,6 +31,7 @@ class NaverIE(InfoExtractor): 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', + 'upload_date': '20150519', }, 'skip': 'Georestricted', }] @@ -47,9 +49,11 @@ class NaverIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') - video_data = self._download_json('http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), video_id, query={ - 'key': m_id.group(2), - }) + video_data = self._download_json( + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + video_id, query={ + 'key': m_id.group(2), + }) meta = video_data['meta'] title = meta['subject'] formats = [] @@ -100,6 +104,12 @@ class NaverIE(InfoExtractor): 'url': caption_url, }) + upload_date = self._search_regex( + 
r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', + webpage, 'upload date', fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') + return { 'id': video_id, 'title': title, @@ -108,4 +118,5 @@ class NaverIE(InfoExtractor): 'description': self._og_search_description(webpage), 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), 'view_count': int_or_none(meta.get('count')), + 'upload_date': upload_date, } From fdd0b8f8e044adf61da95fe85bc9414724f1d835 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 09:44:15 +0100 Subject: [PATCH 19/87] [tvp] extract video id from the webpage(fixes #7799) --- youtube_dl/extractor/tvp.py | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index e84876b54..2abfb7830 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -24,6 +24,7 @@ class TVPIE(InfoExtractor): 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, I seria – odc. 
13', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -32,6 +33,16 @@ class TVPIE(InfoExtractor): 'id': '17916176', 'ext': 'mp4', 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + }, + }, { + # page id is not the same as video id(#7799) + 'url': 'http://vod.tvp.pl/22704887/08122015-1500', + 'md5': 'cf6a4705dfd1489aef8deb168d6ba742', + 'info_dict': { + 'id': '22680786', + 'ext': 'mp4', + 'title': 'Wiadomości, 08.12.2015, 15:00', }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', @@ -53,6 +64,39 @@ class TVPIE(InfoExtractor): 'only_matching': True, }] + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex([ + r']+src="[^"]*?object_id=(\d+)', + "object_id\s*:\s*'(\d+)'"], webpage, 'video id') + return { + '_type': 'url_transparent', + 'url': 'tvp:' + video_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'ie_key': 'TVPEmbed', + } + + +class TVPEmbedIE(InfoExtractor): + IE_NAME = 'tvp:embed' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P\d+)' + + _TESTS = [{ + 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', + 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'info_dict': { + 'id': '22670268', + 'ext': 'mp4', + 'title': 'Panorama, 07.12.2015, 15:40', + }, + }, { + 'url': 'tvp:22670268', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) From a7d29530732574554cd20324fda0faa17979a1ae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 10:11:59 +0100 Subject: [PATCH 20/87] [extractors] add tvp:embed import --- 
youtube_dl/extractor/extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 28858cfea..fec560ba3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -892,6 +892,7 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvp import ( + TVPEmbedIE, TVPIE, TVPSeriesIE, ) From 5a993e169262237f4990207049d4cbbc69d5fd8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 10:13:11 +0100 Subject: [PATCH 21/87] [natgeo] fix tests(closes #10229) --- youtube_dl/extractor/nationalgeographic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 4bffcf704..0027ff1b8 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -74,7 +74,7 @@ class NationalGeographicIE(ThePlatformIE): 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { - 'id': 'nB5vIAfmyllm', + 'id': 'vKInpacll2pC', 'ext': 'mp4', 'title': 'Uncovering a Universal Knowledge', 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', @@ -88,7 +88,7 @@ class NationalGeographicIE(ThePlatformIE): 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { - 'id': '3TmMv9OvGwIR', + 'id': 'Pok5lWCkiEFA', 'ext': 'mp4', 'title': 'The Stunning Red Bird of Paradise', 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', From 08c655906c6afd41ca56e784f35791cfdb727d3d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 10:22:33 +0100 Subject: [PATCH 22/87] [5min] fix _VALID_URL(closes #10228) --- youtube_dl/extractor/fivemin.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index ab82701c2..f3f876ecd 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com|delivery\.vidible\.tv/aol)/(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P\d+)' + _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P\d+)' _TESTS = [ { From f0d31c624e51322e26889df417152b833549bb75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Aug 2016 22:17:32 +0700 Subject: [PATCH 23/87] [tvplay] Add support for subtitles (Closes #10194) --- youtube_dl/extractor/tvplay.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 918f8f8bc..0c072a6ae 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( parse_iso8601, qualities, @@ -226,7 +229,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): - m = re.search(r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', video_url) + m = re.search( + r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', video_url) if not m: continue fmt.update({ @@ -242,6 +246,17 @@ class TVPlayIE(InfoExtractor): formats.append(fmt) self._sort_formats(formats) + # TODO: webvtt in m3u8 + subtitles = {} + sami_path = video.get('sami_path') + if sami_path: + lang = self._search_regex( + r'_([a-z]{2})\.xml', sami_path, 'lang', + default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) + 
subtitles[lang] = [{ + 'url': sami_path, + }] + return { 'id': video_id, 'title': title, @@ -251,4 +266,5 @@ class TVPlayIE(InfoExtractor): 'view_count': int_or_none(video.get('views', {}).get('total')), 'age_limit': int_or_none(video.get('age_limit', 0)), 'formats': formats, + 'subtitles': subtitles, } From 5ca968d0a6278400be7f4215f93c026c2bf0eafb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Aug 2016 22:37:38 +0700 Subject: [PATCH 24/87] [tvplay] Extract series metadata --- youtube_dl/extractor/tvplay.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 0c072a6ae..3d4c576c8 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -37,6 +37,9 @@ class TVPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Kādi ir īri? - Viņas melo labāk', 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', + 'series': 'Viņas melo labāk', + 'season': '2.sezona', + 'season_number': 2, 'duration': 25, 'timestamp': 1406097056, 'upload_date': '20140723', @@ -49,6 +52,10 @@ class TVPlayIE(InfoExtractor): 'ext': 'flv', 'title': 'Moterys meluoja geriau', 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', + 'series': 'Moterys meluoja geriau', + 'episode_number': 47, + 'season': '1 sezonas', + 'season_number': 1, 'duration': 1330, 'timestamp': 1403769181, 'upload_date': '20140626', @@ -257,10 +264,19 @@ class TVPlayIE(InfoExtractor): 'url': sami_path, }] + series = video.get('format_title') + episode_number = int_or_none(video.get('format_position', {}).get('episode')) + season = video.get('_embedded', {}).get('season', {}).get('title') + season_number = int_or_none(video.get('format_position', {}).get('season')) + return { 'id': video_id, 'title': title, 'description': video.get('description'), + 'series': series, + 'episode_number': episode_number, + 'season': season, + 'season_number': season_number, 'duration': 
int_or_none(video.get('duration')), 'timestamp': parse_iso8601(video.get('created_at')), 'view_count': int_or_none(video.get('views', {}).get('total')), From 0ca057b965e5699a88fc952da460b5adfb8e7644 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 16:39:24 +0100 Subject: [PATCH 25/87] [jwplatform] add support for playlist extraction and relative urls and improve audio detection --- youtube_dl/extractor/jwplatform.py | 132 ++++++++++++++++------------- 1 file changed, 72 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index e44e31104..2a499bb77 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( determine_ext, float_or_none, int_or_none, + mimetype2ext, ) @@ -28,74 +30,84 @@ class JWPlatformBaseIE(InfoExtractor): return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None): + def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 if 'playlist' not in jwplayer_data: jwplayer_data = {'playlist': [jwplayer_data]} - video_data = jwplayer_data['playlist'][0] + entries = [] + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 
'sources' not in video_data: - video_data['sources'] = [video_data] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - source_type = source.get('type') or '' - if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type.startswith('audio'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - }) - else: - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv', - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('file') and track.get('kind') == 'captions': - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track['file']) + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + # 
https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, }) + else: + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv', - return { - 'id': video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - } + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) + }) + + entries.append({ + 'id': video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + 
return entries[0] + else: + return self.playlist_result(entries) class JWPlatformIE(JWPlatformBaseIE): From d50aca41f8e3aa3af8e19ec91100283b555ac59f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 16:40:21 +0100 Subject: [PATCH 26/87] [archiveorg] improve format extraction(closes #10219) --- youtube_dl/extractor/archiveorg.py | 78 +++++++++++++++--------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 8feb7cb74..2472e4cc6 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,67 +1,65 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import unified_strdate +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + unified_strdate, + clean_html, +) -class ArchiveOrgIE(InfoExtractor): +class ArchiveOrgIE(JWPlatformBaseIE): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P[^?/]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', 'info_dict': { 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'ext': 'ogv', + 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', - 'description': 'md5:1780b464abaca9991d8968c877bb53ed', + 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', 'upload_date': '19681210', 'uploader': 'SRI International' } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': '18f2a19e6d89af8425671da1cf3d4e04', + 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', 'info_dict': { 'id': 'Cops1922', - 'ext': 'ogv', + 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:70f72ee70882f713d4578725461ffcc3', + 'description': 
'md5:b4544662605877edd99df22f9620d858', } + }, { + 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://archive.org/embed/' + video_id, video_id) + jwplayer_playlist = self._parse_json(self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);", + webpage, 'jwplayer playlist'), video_id) + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) - json_url = url + ('&' if '?' in url else '?') + 'output=json' - data = self._download_json(json_url, video_id) + def get_optional(metadata, field): + return metadata.get(field, [None])[0] - def get_optional(data_dict, field): - return data_dict['metadata'].get(field, [None])[0] - - title = get_optional(data, 'title') - description = get_optional(data, 'description') - uploader = get_optional(data, 'creator') - upload_date = unified_strdate(get_optional(data, 'date')) - - formats = [ - { - 'format': fdata['format'], - 'url': 'http://' + data['server'] + data['dir'] + fn, - 'file_size': int(fdata['size']), - } - for fn, fdata in data['files'].items() - if 'Video' in fdata['format']] - - self._sort_formats(formats) - - return { - '_type': 'video', - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - 'upload_date': upload_date, - 'thumbnail': data.get('misc', {}).get('image'), - } + metadata = self._download_json( + 'http://archive.org/details/' + video_id, video_id, query={ + 'output': 'json', + })['metadata'] + info.update({ + 'title': get_optional(metadata, 'title') or info.get('title'), + 'description': clean_html(get_optional(metadata, 'description')), + }) + if info.get('_type') != 'playlist': + info.update({ + 'uploader': get_optional(metadata, 'creator'), + 'upload_date': unified_strdate(get_optional(metadata, 'date')), + }) + return info \ No 
newline at end of file From 3859ebeee6d6448240176ef5e4c20f6b1d1db795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Aug 2016 22:49:19 +0700 Subject: [PATCH 27/87] [tvplay] Capture and output native error message --- youtube_dl/extractor/tvplay.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3d4c576c8..150bde663 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -5,15 +5,17 @@ import re from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, compat_urlparse, ) from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, parse_iso8601, qualities, - determine_ext, update_url_query, - int_or_none, ) @@ -206,12 +208,15 @@ class TVPlayIE(InfoExtractor): title = video['title'] - if video.get('is_geo_blocked'): - self.report_warning( - 'This content might not be available in your country due to copyright reasons') - - streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') + try: + streams = self._download_json( + 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, + video_id, 'Downloading streams JSON') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) + raise ExtractorError(msg['msg'], expected=True) + raise quality = qualities(['hls', 'medium', 'high']) formats = [] @@ -251,6 +256,11 @@ class TVPlayIE(InfoExtractor): 'url': video_url, }) formats.append(fmt) + + if not formats and video.get('is_geo_blocked'): + self.raise_geo_restricted( + 'This content might not be available in your country due to copyright reasons') + self._sort_formats(formats) # TODO: webvtt in m3u8 From 46933a15d69a0079e66a04475659d0cbd5e5f08d Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Aug 2016 23:14:32 +0700 Subject: [PATCH 28/87] [extractor/common] Support root JSON-LD lists (Closes #10203) --- youtube_dl/extractor/common.py | 76 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2d337d614..70909fc1c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -828,41 +828,47 @@ class InfoExtractor(object): if not json_ld: return {} info = {} - if json_ld.get('@context') == 'http://schema.org': - item_type = json_ld.get('@type') - if expected_type is not None and expected_type != item_type: - return info - if item_type == 'TVEpisode': - info.update({ - 'episode': unescapeHTML(json_ld.get('name')), - 'episode_number': int_or_none(json_ld.get('episodeNumber')), - 'description': unescapeHTML(json_ld.get('description')), - }) - part_of_season = json_ld.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = json_ld.get('partOfSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': - info.update({ - 'timestamp': parse_iso8601(json_ld.get('datePublished')), - 'title': unescapeHTML(json_ld.get('headline')), - 'description': unescapeHTML(json_ld.get('articleBody')), - }) - elif item_type == 'VideoObject': - info.update({ - 'url': json_ld.get('contentUrl'), - 'title': unescapeHTML(json_ld.get('name')), - 'description': unescapeHTML(json_ld.get('description')), - 'thumbnail': json_ld.get('thumbnailUrl'), - 'duration': parse_duration(json_ld.get('duration')), - 'timestamp': unified_timestamp(json_ld.get('uploadDate')), - 'filesize': float_or_none(json_ld.get('contentSize')), - 'tbr': 
int_or_none(json_ld.get('bitrate')), - 'width': int_or_none(json_ld.get('width')), - 'height': int_or_none(json_ld.get('height')), - }) + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, dict): + json_ld = [json_ld] + for e in json_ld: + if e.get('@context') == 'http://schema.org': + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + return info + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(e.get('name')), + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = e.get('partOfSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + }) + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod From 84bc23b41b5d1bc51102b264c23e96202e093d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 5 Aug 2016 23:16:19 +0700 Subject: [PATCH 29/87] [archiveorg] PEP 8 --- 
youtube_dl/extractor/archiveorg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 2472e4cc6..486dff82d 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -62,4 +62,4 @@ class ArchiveOrgIE(JWPlatformBaseIE): 'uploader': get_optional(metadata, 'creator'), 'upload_date': unified_strdate(get_optional(metadata, 'date')), }) - return info \ No newline at end of file + return info From 038a5e1a65401ad6bfd2b6572f321af66e7cd9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Aug 2016 00:00:05 +0700 Subject: [PATCH 30/87] [adultswim] Add support for trailers (Closes #10235) --- youtube_dl/extractor/adultswim.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 8157da2cb..6d20229f9 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -83,6 +83,21 @@ class AdultSwimIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + # heroMetadata.trailer + 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', + 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', + 'info_dict': { + 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', + 'ext': 'mp4', + 'title': 'Decker - Inside Decker: A New Hero', + 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', + 'duration': 249.008, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] @staticmethod @@ -133,20 +148,26 @@ class AdultSwimIE(InfoExtractor): if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] - else: - raise ExtractorError('Unable to find video info') + if not video_info: + video_info = bootstrapped_data.get('heroMetadata', {}).get('trailer').get('video') + if not video_info: + raise 
ExtractorError('Unable to find video info') show = bootstrapped_data['show'] show_title = show['title'] stream = video_info.get('stream') - clips = [stream] if stream else video_info.get('clips') - if not clips: + if stream and stream.get('videoPlaybackID'): + segment_ids = [stream['videoPlaybackID']] + elif video_info.get('clips'): + segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + elif video_info.get('videoPlaybackID'): + segment_ids = [video_info['videoPlaybackID']] + else: raise ExtractorError( 'This video is only available via cable service provider subscription that' ' is not currently supported. You may want to use --cookies.' if video_info.get('auth') is True else 'Unable to find stream or clips', expected=True) - segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] episode_title = video_info['title'] From fe3ad1d45651d3bd05a3faae77bc23bd05e689f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Aug 2016 00:02:05 +0700 Subject: [PATCH 31/87] [adultswim] Remove superfluous md5 from test --- youtube_dl/extractor/adultswim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 6d20229f9..3f7f8c036 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -86,7 +86,6 @@ class AdultSwimIE(InfoExtractor): }, { # heroMetadata.trailer 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', - 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', 'info_dict': { 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', 'ext': 'mp4', From 8122e79fefa8eca62c3cddbf686c8920b6d06a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Aug 2016 00:12:37 +0700 Subject: [PATCH 32/87] [gamekings] Remove remnants --- youtube_dl/extractor/gamekings.py | 76 ------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 youtube_dl/extractor/gamekings.py diff --git 
a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py deleted file mode 100644 index cbcddcb7c..000000000 --- a/youtube_dl/extractor/gamekings.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - xpath_text, - xpath_with_ns, -) -from .youtube import YoutubeIE - - -class GamekingsIE(InfoExtractor): - _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P[^/]+)' - _TESTS = [{ - # YouTube embed video - 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', - 'md5': '5208d3a17adeaef829a7861887cb9029', - 'info_dict': { - 'id': 'HkSQKetlGOU', - 'ext': 'mp4', - 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', - 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', - 'uploader': 'Gamekings Vault', - 'upload_date': '20151123', - }, - 'add_ie': ['Youtube'], - }, { - # vimeo video - 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', - 'md5': '12bf04dfd238e70058046937657ea68d', - 'info_dict': { - 'id': 'the-legend-of-zelda-majoras-mask', - 'ext': 'mp4', - 'title': 'The Legend of Zelda: Majora’s Mask', - 'description': 'md5:9917825fe0e9f4057601fe1e38860de3', - 'thumbnail': 're:^https?://.*\.jpg$', - }, - }, { - 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist_id = self._search_regex( - r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') - - # Check if a YouTube embed is used - if YoutubeIE.suitable(playlist_id): - return self.url_result(playlist_id, ie='Youtube') - - playlist = self._download_xml( - 
'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, - video_id) - - NS_MAP = { - 'jwplayer': 'http://rss.jwpcdn.com/' - } - - item = playlist.find('./channel/item') - - thumbnail = xpath_text(item, xpath_with_ns('./jwplayer:image', NS_MAP), 'thumbnail') - video_url = item.find(xpath_with_ns('./jwplayer:source', NS_MAP)).get('file') - - return { - 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - } From 7f2339c61789256b3ed12e196d14eff6791f4f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Aug 2016 01:19:47 +0700 Subject: [PATCH 33/87] [ChangeLog] Actualize --- ChangeLog | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2d021ddf4..7fd44d2a0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,36 @@ version -Fixed/improved extractors -- [cbslocal] Fix timestamp parsing (#10213) +Core +* Add support for JSON-LD root list entries (#10203) +* Improve unified_timestamp +* Lower preference of RTSP formats in generic sorting ++ Add support for multiple properties in _og_search_property +* Improve password hiding from verbose output + +Extractors ++ [adultswim] Add support for trailers (#10235) +* [archiveorg] Improve extraction (#10219) ++ [jwplatform] Add support for playlists ++ [jwplatform] Add support for relative URLs +* [jwplatform] Improve audio detection ++ [tvplay] Capture and output native error message ++ [tvplay] Extract series metadata ++ [tvplay] Add support for subtitles (#10194) +* [tvp] Improve extraction (#7799) +* [cbslocal] Fix timestamp parsing (#10213) ++ [naver] Add support for subtitles (#8096) +* [naver] Improve extraction +* [condenast] Improve extraction +* [engadget] Relax URL regular expression +* [5min] Fix extraction ++ [nationalgeographic] Add support for Episode Guide ++ [kaltura] Add 
support for subtitles +* [kaltura] Optimize network requests ++ [vodplatform] Add extractor for vod-platform.net +- [gamekings] Remove extractor +* [limelight] Extract HTTP formats +* [ntvru] Fix extraction ++ [comedycentral] Re-add :tds and :thedailyshow shortnames version 2016.08.01 From 491c42e690bd51687e43fd5178bebf99dcc2cc0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 6 Aug 2016 01:23:48 +0700 Subject: [PATCH 34/87] release 2016.08.06 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 9d15b6a89..7241840c5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.06** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.01 +[debug] youtube-dl version 2016.08.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7fd44d2a0..9fd78cdda 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.06 Core * Add support for JSON-LD root list entries (#10203) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1f89b1c14..2c8f950df 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,6 +142,7 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** + - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** @@ -247,7 +248,6 @@ - **FunnyOrDie** - **Fusion** - **GameInformer** - - **Gamekings** - **GameOne** - **gameone:playlist** - **Gamersyde** @@ -415,7 +415,8 @@ - **MyVidster** - **n-tv.de** - **natgeo** - - **natgeo:channel** + - **natgeo:episodeguide** + - **natgeo:video** - **Naver** - **NBA** - **NBC** @@ -726,6 +727,7 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **tvp**: Telewizja Polska + - 
**tvp:embed**: Telewizja Polska - **tvp:series** - **TVPlay**: TV3Play and related services - **Tweakers** @@ -805,6 +807,7 @@ - **vk:wallpost** - **vlive** - **Vodlocker** + - **VODPlatform** - **VoiceRepublic** - **VoxMedia** - **Vporn** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 27f97b213..a4654fa59 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.01' +__version__ = '2016.08.06' From e563c0d73b778a1c91007f8abe0e6b43b1f7b608 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 5 Aug 2016 21:01:16 +0100 Subject: [PATCH 35/87] [condenast] fallback to loader.js if video.js fail --- youtube_dl/extractor/condenast.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 976a0e89d..15fabbb1c 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -113,11 +113,19 @@ class CondeNastIE(InfoExtractor): 'target': params['id'], }) video_id = query['videoId'] + video_info = None info_page = self._download_webpage( 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', query=query) - video_info = self._parse_json(self._search_regex( - r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] + video_id, 'Downloading video info', query=query, fatal=False) + if info_page: + video_info = self._parse_json(self._search_regex( + r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] + else: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=query) + video_info = self._parse_json(self._search_regex( + r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) title = video_info['title'] formats = [] From d73ebac100c9f91acb002c4844ba67b73616322a Mon Sep 17 00:00:00 2001 From: Remita Amine 
Date: Sat, 6 Aug 2016 11:18:14 +0100 Subject: [PATCH 36/87] [pokemon] Add new extractor(closes #10093) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/pokemon.py | 52 ++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/pokemon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fec560ba3..11b64eeaa 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -636,6 +636,7 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .pokemon import PokemonIE from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py new file mode 100644 index 000000000..ce27f33e1 --- /dev/null +++ b/youtube_dl/extractor/pokemon.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/pokemon-episodes/(?P[^/?#]+))' + _TESTS = [{ + 'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', + 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', + 'info_dict': { + 'id': 'd0436c00c3ce4071ac6cee8130ac54a1', + 'ext': 'mp4', + 'title': 'From A to Z!', + 'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!', + 'timestamp': 1460478136, + 'upload_date': '20160412', + }, + 'add_id': ['LimelightMedia'] + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id 
or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data['data-video-title'] + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } From 089a40955cdcdce1d8ea89b1402bde4c88c75546 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 12:08:14 +0100 Subject: [PATCH 37/87] [pokemon] improve _VALID_URL --- youtube_dl/extractor/pokemon.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index ce27f33e1..2d87e7e70 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -11,7 +11,7 @@ from ..utils import ( class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/pokemon-episodes/(?P[^/?#]+))' + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P[^/?#]+))' _TESTS = [{ 'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', @@ -27,6 +27,12 @@ class PokemonIE(InfoExtractor): }, { 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 
'only_matching': True, }] def _real_extract(self, url): From a7e5f274123dcfe9afd53134a33f83d53d1e497e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 18:48:09 +0100 Subject: [PATCH 38/87] [bbc] improve extraction - extract f4m and dash formats - improve format sorting and listing - improve extraction of articles with `otherSettings.playlist` --- youtube_dl/extractor/bbc.py | 175 +++++++++++++++++------------------- 1 file changed, 83 insertions(+), 92 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9cb7630a1..45c562bd8 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -229,51 +229,6 @@ class BBCCoUkIE(InfoExtractor): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] - def _extract_connection(self, connection, programme_id): - formats = [] - kind = connection.get('kind') - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - transfer_format = connection.get('transferFormat') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Skip DASH until supported - elif transfer_format == 'dash': - pass - elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=supplier, fatal=False)) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier or kind or protocol, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, 
application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) @@ -294,46 +249,6 @@ class BBCCoUkIE(InfoExtractor): def _extract_connections(self, media): return self._findall_ns(media, './{%s}connection') - def _extract_video(self, media, programme_id): - formats = [] - vbr = int_or_none(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - if service: - format['format_id'] = '%s_%s' % (service, format['format_id']) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int_or_none(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - 'vcodec': 'none', - }) - formats.extend(conn_formats) - return formats - def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): @@ -382,10 +297,77 @@ class BBCCoUkIE(InfoExtractor): for media in 
self._extract_medias(media_selection): kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + href = connection.get('href') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'vbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol == 'http': + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = 
connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -820,13 +802,19 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: - for key in ('progressiveDownload', 'streaming'): + entry = None + for key in ('streaming', 'progressiveDownload'): playlist_url = playlist.get('%sUrl' % key) if not playlist_url: continue try: - entries.append(self._extract_from_playlist_sxml( - playlist_url, playlist_id, timestamp)) + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) except Exception as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. 
@@ -834,6 +822,9 @@ class BBCIE(BBCCoUkIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: continue raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) From c57244cdb10cac6ff88fd4fa430c98f3909c065a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 18:55:05 +0100 Subject: [PATCH 39/87] [common] lower the preference of m3u8 master manifest format --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 70909fc1c..b54a36996 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1140,7 +1140,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': preference - 1 if preference else -1, + 'preference': -100, 'resolution': 'multiple', 'format_note': 'Quality selection URL', } From d16b3c6677f1f699635892876f4566962094221d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 18:58:38 +0100 Subject: [PATCH 40/87] [common] extract partOfTVSeries info in json-ld --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b54a36996..67f49f51b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -846,7 +846,7 @@ class InfoExtractor(object): part_of_season = e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = e.get('partOfSeries') + part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': info['series'] = unescapeHTML(part_of_series.get('name')) elif 
item_type == 'Article': From b0af12154e4a76bf7d493c7eb75cdeea6cf8fe17 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 19:24:59 +0100 Subject: [PATCH 41/87] [bbc] reduce requests and improve format_id --- youtube_dl/extractor/bbc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 45c562bd8..35d042e4d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -294,6 +294,7 @@ class BBCCoUkIE(InfoExtractor): def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None + urls = [] for media in self._extract_medias(media_selection): kind = media.get('kind') @@ -305,10 +306,14 @@ class BBCCoUkIE(InfoExtractor): height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) conn_kind = connection.get('kind') protocol = connection.get('protocol') supplier = connection.get('supplier') - href = connection.get('href') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or protocol if service: @@ -331,6 +336,8 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: + if bitrate: + format_id += '-%d' % bitrate fmt = { 'format_id': format_id, 'filesize': file_size, From ad152e2d954445c1bfa974c5d4a47ea622269d82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 19:36:12 +0100 Subject: [PATCH 42/87] [bbc] fix test --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 35d042e4d..7bc7b2ed6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -578,7 +578,7 @@ class BBCIE(BBCCoUkIE): 'info_dict': 
{ 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', - 'title': "Tel Abyad'da IŞİD bayrağı indirildi YPG bayrağı çekildi", + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', 'timestamp': 1434397334, 'upload_date': '20150615', From d3f8e038fea62fc117b7747627eb8d654863d152 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 7 Aug 2016 02:42:58 +0800 Subject: [PATCH 43/87] [utils] Add decode_png for openload (#9706) --- youtube_dl/utils.py | 108 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 97ddd9883..ddbfcd2f1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,6 +47,7 @@ from .compat import ( compat_socket_create_connection, compat_str, compat_struct_pack, + compat_struct_unpack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, @@ -2969,3 +2970,110 @@ def parse_m3u8_attributes(attrib): def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in 
chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise IOError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels From c1decda58c812b3d0a3d4dfa998e7d8bd8f99203 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 7 Aug 2016 02:44:15 +0800 Subject: [PATCH 44/87] [openload] Fix extraction (closes #9706) --- youtube_dl/extractor/openload.py | 133 +++++++++++++++---------------- 1 file changed, 63 insertions(+), 70 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 6415b8fdc..4e80ca9ff 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,15 +1,14 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import unicode_literals, division -import re +import math from .common import InfoExtractor from ..compat import compat_chr 
from ..utils import ( + decode_png, determine_ext, - encode_base_n, ExtractorError, - mimetype2ext, ) @@ -41,60 +40,6 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def openload_level2_debase(m): - radix, num = int(m.group(1)) + 27, int(m.group(2)) - return '"' + encode_base_n(num, radix) + '"' - - @classmethod - def openload_level2(cls, txt): - # The function name is ǃ \u01c3 - # Using escaped unicode literals does not work in Python 3.2 - return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') - - # Openload uses a variant of aadecode - # openload_decode and related functions are originally written by - # vitas@matfyz.cz and released with public domain - # See https://github.com/rg3/youtube-dl/issues/8489 - @classmethod - def openload_decode(cls, txt): - symbol_table = [ - ('_', '(゚Д゚) [゚Θ゚]'), - ('a', '(゚Д゚) [゚ω゚ノ]'), - ('b', '(゚Д゚) [゚Θ゚ノ]'), - ('c', '(゚Д゚) [\'c\']'), - ('d', '(゚Д゚) [゚ー゚ノ]'), - ('e', '(゚Д゚) [゚Д゚ノ]'), - ('f', '(゚Д゚) [1]'), - - ('o', '(゚Д゚) [\'o\']'), - ('u', '(o゚ー゚o)'), - ('c', '(゚Д゚) [\'c\']'), - - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('4', '(-~3)'), - ('3', '(-~-~1)'), - ('2', '(-~1)'), - ('1', '(-~0)'), - ('0', '((c^_^o)-(c^_^o))'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aachar in txt.split(delim): - for val, pat in symbol_table: - aachar = aachar.replace(pat, val) - aachar = aachar.replace('+ ', '') - m = re.match(r'^\d+', aachar) - if m: - ret += compat_chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aachar) - if m: - ret += compat_chr(int(m.group(1), 16)) - return cls.openload_level2(ret) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -102,29 +47,77 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage: raise ExtractorError('File not found', expected=True) - code = self._search_regex( - r'\s*
\s*]+>[^>]+\s*]+>([^<]+)', - webpage, 'JS code') + # The following extraction logic is proposed by @Belderak and @gdkchan + # and declared to be used freely in youtube-dl + # See https://github.com/rg3/youtube-dl/issues/9706 - decoded = self.openload_decode(code) + numbers_js = self._download_webpage( + 'https://openload.co/assets/js/obfuscator/n.js', video_id, + note='Downloading signature numbers') + signums = self._search_regex( + r'window\.signatureNumbers\s*=\s*[\'"](?P[a-z]+)[\'"]', + numbers_js, 'signature numbers', group='data') - video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + linkimg_uri = self._search_regex( + r']+id="linkimg"[^>]+src="([^"]+)"', webpage, 'link image') + linkimg = self._request_webpage( + linkimg_uri, video_id, note=False).read() + + width, height, pixels = decode_png(linkimg) + + output = '' + for y in range(height): + for x in range(width): + r, g, b = pixels[y][3 * x:3 * x + 3] + if r == 0 and g == 0 and b == 0: + break + else: + output += compat_chr(r) + output += compat_chr(g) + output += compat_chr(b) + + img_str_length = len(output) // 200 + img_str = [[0 for x in range(img_str_length)] for y in range(10)] + + sig_str_length = len(signums) // 260 + sig_str = [[0 for x in range(sig_str_length)] for y in range(10)] + + for i in range(10): + for j in range(img_str_length): + begin = i * img_str_length * 20 + j * 20 + img_str[i][j] = output[begin:begin + 20] + for j in range(sig_str_length): + begin = i * sig_str_length * 26 + j * 26 + sig_str[i][j] = signums[begin:begin + 26] + + parts = [] + # TODO: find better names for str_, chr_ and sum_ + str_ = '' + for i in [2, 3, 5, 7]: + str_ = '' + sum_ = float(99) + for j in range(len(sig_str[i])): + for chr_idx in range(len(img_str[i][j])): + if sum_ > float(122): + sum_ = float(98) + chr_ = compat_chr(int(math.floor(sum_))) + if sig_str[i][j][chr_idx] == chr_ and j >= len(str_): + sum_ += float(2.5) + str_ += img_str[i][j][chr_idx] + 
parts.append(str_.replace(',', '')) + + video_url = 'https://openload.co/stream/%s~%s~%s~%s' % (parts[3], parts[1], parts[2], parts[0]) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - ext = mimetype2ext(self._search_regex( - r'window\.vt\s*=\s*(["\'])(?P.+?)\1', decoded, - 'mimetype', default=None, group='mimetype')) or determine_ext( - video_url, 'mp4') - return { 'id': video_id, 'title': title, - 'ext': ext, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, + # Seems all videos have extensions in their titles + 'ext': determine_ext(title), } From e37b54b140c552ee3d751b868ecc0a71df13829f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 6 Aug 2016 20:53:13 +0100 Subject: [PATCH 45/87] [fox] fix theplatform release url query --- youtube_dl/extractor/fox.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 95c1abf94..9f406b17e 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FOXIE(InfoExtractor): @@ -29,11 +32,12 @@ class FOXIE(InfoExtractor): release_url = self._parse_json(self._search_regex( r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'), - video_id)['release_url'] + '&switch=http' + video_id)['release_url'] return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', - 'url': smuggle_url(release_url, {'force_smil_url': True}), + 'url': smuggle_url(update_url_query( + release_url, {'switch': 'http'}), {'force_smil_url': True}), 'id': video_id, } From b47a75017bc4b4739ccebb3e8bde0642981968b0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 15:57:42 +0700 Subject: [PATCH 46/87] [tnaflix] Fix metadata extraction (Closes #10249) --- youtube_dl/extractor/tnaflix.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 78174178e..74a0c8f17 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -189,9 +189,9 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P[^/]+)/video(?P\d+)' - _TITLE_REGEX = r'(.+?) - TNAFlix Porn Videos' - _DESCRIPTION_REGEX = r']+name="description"[^>]+content="([^"]+)"' - _UPLOADER_REGEX = r'\s*Verified Member\s*\s*

(.+?)

' + _TITLE_REGEX = r'(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' + _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' + _UPLOADER_REGEX = r'\s*Verified Member\s*\s*(.+?)<' _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)' _TESTS = [{ From a1aadd09a4721ffd4d782aae4ddeae2699ca17fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 15:59:05 +0700 Subject: [PATCH 47/87] [tnaflixnetworkbase] Improve title extraction --- youtube_dl/extractor/tnaflix.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 74a0c8f17..7ddf77767 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -118,8 +118,12 @@ class TNAFlixNetworkBaseIE(InfoExtractor): xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') thumbnails = self._extract_thumbnails(cfg_xml) - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + title = None + if self._TITLE_REGEX: + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title', default=None) + if not title: + title = self._og_search_title(webpage) age_limit = self._rta_search(webpage) or 18 From 37768f92422c2cf61a961dbaf54d61dabd364506 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 7 Aug 2016 10:58:11 +0100 Subject: [PATCH 48/87] [common] correctly lower the preference of m3u8 master manifest format --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 67f49f51b..0891309dd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1140,7 +1140,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -100, + 'preference': preference - 100 if preference else -100, 'resolution': 'multiple', 'format_note': 'Quality selection 
URL', } From f9622868e722a1863de257c7f3cbc3076eea6e83 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 7 Aug 2016 11:14:15 +0100 Subject: [PATCH 49/87] [bbc] preserve format_id backward compatibility --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 7bc7b2ed6..b6c240832 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -336,7 +336,7 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if bitrate: + if not service and not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, From 9fb64c04cdbe1b58f968fa80489168173ac7e565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 18:01:50 +0700 Subject: [PATCH 50/87] [bbc] Add support for morph embeds (Closes #10239) --- youtube_dl/extractor/bbc.py | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b6c240832..0ee096dda 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -5,11 +5,13 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, ExtractorError, float_or_none, int_or_none, parse_duration, parse_iso8601, + try_get, unescapeHTML, ) from ..compat import ( @@ -643,6 +645,24 @@ class BBCIE(BBCCoUkIE): # rtmp download 'skip_download': True, } + }, { + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'proxy': 
'5.101.173.158:8080', + }, + 'skip': 'Georestricted to UK', }, { # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', @@ -864,6 +884,50 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) + # There are several setPayload calls may be present but the video + # seems to be always related to the first one + morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + webpage, 'morph payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] + for component in components: + if not isinstance(component, dict): + continue + lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) + if not lead_media: + continue + identifiers = lead_media.get('identifiers') + if not identifiers or not isinstance(identifiers, dict): + continue + programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + if not programme_id: + continue + title = lead_media.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = lead_media.get('summary') + uploader = lead_media.get('masterBrand') + uploader_id = lead_media.get('mid') + duration = None + duration_d = lead_media.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From aaa42cf0cf3e8c09209908474998fbd3dc86b91a 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 18:05:13 +0700 Subject: [PATCH 51/87] [bbc] PEP 8 --- youtube_dl/extractor/bbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0ee096dda..d059e02a3 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -339,7 +339,7 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, f4m_id=format_id, fatal=False)) else: if not service and not supplier and bitrate: - format_id += '-%d' % bitrate + format_id += '-%d' % bitrate fmt = { 'format_id': format_id, 'filesize': file_size, @@ -945,7 +945,7 @@ class BBCIE(BBCCoUkIE): r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) if entries: return self.playlist_result( - [self.url_result(entry, 'BBCCoUk') for entry in entries], + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. 
http://www.bbc.com/news/world-europe-32668511) From 998f0944526658ae19f29fd4ca04391a5a1ac027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 18:13:05 +0700 Subject: [PATCH 52/87] [bbc] Remove proxy from test --- youtube_dl/extractor/bbc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index d059e02a3..b29e05970 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -660,7 +660,6 @@ class BBCIE(BBCCoUkIE): 'params': { # m3u8 download 'skip_download': True, - 'proxy': '5.101.173.158:8080', }, 'skip': 'Georestricted to UK', }, { From 958849275f1c6072c712e8d611294a762fadc7f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 19:04:22 +0700 Subject: [PATCH 53/87] [extractor/generic] Make _search_json_ld non fatal --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e89a03760..44c6c354c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2241,7 +2241,7 @@ class GenericIE(InfoExtractor): # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( - webpage, video_id, default=None, expected_type='VideoObject') + webpage, video_id, fatal=False, expected_type='VideoObject') if json_ld and json_ld.get('url'): info_dict.update({ 'title': video_title or info_dict['title'], From d34995a9e3f596c4dd80178d99f7bd8dbc748e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 19:06:55 +0700 Subject: [PATCH 54/87] [flipagram] Make _search_json_ld non fatal --- youtube_dl/extractor/flipagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index acb6133ff..634f5fe3b 100644 --- a/youtube_dl/extractor/flipagram.py +++ 
b/youtube_dl/extractor/flipagram.py @@ -48,7 +48,7 @@ class FlipagramIE(InfoExtractor): flipagram = video_data['flipagram'] video = flipagram['video'] - json_ld = self._search_json_ld(webpage, video_id, default=False) + json_ld = self._search_json_ld(webpage, video_id, fatal=False) title = json_ld.get('title') or flipagram['captionText'] description = json_ld.get('description') or flipagram.get('captionText') From a8795327cae2bfce299d770039db40a3ac4df2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 20:45:18 +0700 Subject: [PATCH 55/87] [utils] Add support TV Parental Guidelines ratings in parse_age_limit --- test/test_utils.py | 15 +++++++++++++++ youtube_dl/utils.py | 20 ++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5a2ae4a1e..724346886 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -42,6 +42,7 @@ from youtube_dl.utils import ( ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, + parse_age_limit, parse_duration, parse_filesize, parse_count, @@ -432,6 +433,20 @@ class TestUtil(unittest.TestCase): url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), 'trailer.mp4') + def test_parse_age_limit(self): + self.assertEqual(parse_age_limit(None), None) + self.assertEqual(parse_age_limit(False), None) + self.assertEqual(parse_age_limit('invalid'), None) + self.assertEqual(parse_age_limit(0), 0) + self.assertEqual(parse_age_limit(18), 18) + self.assertEqual(parse_age_limit(21), 21) + self.assertEqual(parse_age_limit(22), None) + self.assertEqual(parse_age_limit('18'), 18) + self.assertEqual(parse_age_limit('18+'), 18) + self.assertEqual(parse_age_limit('PG-13'), 13) + self.assertEqual(parse_age_limit('TV-14'), 14) + self.assertEqual(parse_age_limit('TV-MA'), 17) + def test_parse_duration(self): self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(False), None) diff --git a/youtube_dl/utils.py 
b/youtube_dl/utils.py index ddbfcd2f1..c50238ba1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1984,11 +1984,27 @@ US_RATINGS = { } +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + def parse_age_limit(s): - if s is None: + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): return None m = re.match(r'^(?P\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s) + if m: + return int(m.group('age')) + if s in US_RATINGS: + return US_RATINGS[s] + return TV_PARENTAL_GUIDELINES.get(s) def strip_jsonp(code): From d92cb463052c262a6750f3a5bfc7d564a527ee9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 20:57:05 +0700 Subject: [PATCH 56/87] [discoverygo] Add extractor (Closes #10245) --- youtube_dl/extractor/discoverygo.py | 98 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 99 insertions(+) create mode 100644 youtube_dl/extractor/discoverygo.py diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py new file mode 100644 index 000000000..adb68b96c --- /dev/null +++ b/youtube_dl/extractor/discoverygo.py @@ -0,0 +1,98 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + extract_attributes, + int_or_none, + parse_age_limit, + unescapeHTML, +) + + +class DiscoveryGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discoverygo\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TEST = { + 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', + 'info_dict': { + 'id': '57a33c536b66d1cd0345eeb1', + 'ext': 'mp4', + 'title': 'Kiss First, Ask Questions Later!', + 'description': 'md5:fe923ba34050eae468bffae10831cb22', + 'duration': 2579, + 'series': 'Love at First Kiss', + 'season_number': 1, + 'episode_number': 1, + 
'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + unescapeHTML(container.get('data-video') or container.get('data-json')), + display_id) + + title = video['name'] + + stream = video['stream'] + STREAM_URL_SUFFIX = 'streamUrl' + formats = [] + for stream_kind in ('', 'hds'): + suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX + stream_url = stream.get('%s%s' % (stream_kind, suffix)) + if not stream_url: + continue + if stream_kind == '': + formats.extend(self._extract_m3u8_formats( + stream_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif stream_kind == 'hds': + formats.extend(self._extract_f4m_formats( + stream_url, display_id, f4m_id=stream_kind, fatal=False)) + self._sort_formats(formats) + + video_id = video.get('id') or display_id + description = video.get('description', {}).get('detailed') + duration = int_or_none(video.get('duration')) + + series = video.get('show', {}).get('name') + season_number = int_or_none(video.get('season', {}).get('number')) + episode_number = int_or_none(video.get('episodeNumber')) + + tags = video.get('tags') + age_limit = parse_age_limit(video.get('parental', {}).get('rating')) + + subtitles = {} + captions = stream.get('captions') + if isinstance(captions, list): + for caption in captions: + subtitle_url = caption.get('fileUrl') + if (not subtitle_url or not isinstance(subtitle_url, compat_str) or + not subtitle_url.startswith('http')): + continue + lang = caption.get('fileLang', 'en') + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'series': series, 
+ 'season_number': season_number, + 'episode_number': episode_number, + 'tags': tags, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 11b64eeaa..c2c4617ee 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -221,6 +221,7 @@ from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .discoverygo import DiscoveryGoIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( From 845dfcdc40c20ea43f1f2bed9ccbc34e1994a89e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 21:10:48 +0700 Subject: [PATCH 57/87] [ChangeLog] Actualize --- ChangeLog | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/ChangeLog b/ChangeLog index 9fd78cdda..c4743df6d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +version + +Core ++ Add support for TV Parental Guidelines ratings in parse_age_limit ++ Add decode_png (#9706) ++ Add support for partOfTVSeries in JSON-LD +* Lower master M3U8 manifest preference for better format sorting + +Extractors ++ [discoverygo] Add extractor (#10245) +* [flipagram] Make JSON-LD extraction non fatal +* [generic] Make JSON-LD extraction non fatal ++ [bbc] Add support for morph embeds (#10239) +* [tnaflixnetworkbase] Improve title extraction +* [tnaflix] Fix metadata extraction (#10249) +* [fox] Fix theplatform release URL query +* [openload] Fix extraction (#9706) +* [bbc] Skip duplicate manifest URLs +* [bbc] Improve format code ++ [bbc] Add support for DASH and F4M +* [bbc] Improve format sorting and listing +* [bbc] Improve playlist extraction ++ [pokemon] Add extractor (#10093) ++ [condenast] Add fallback scenario for video info extraction + + version 2016.08.06 Core From 4a01befb347b35f36dda4f344b7b502e53253da7 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Aug 2016 21:12:41 +0700 Subject: [PATCH 58/87] release 2016.08.07 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7241840c5..2319e45df 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.06** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.07*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.07** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.06 +[debug] youtube-dl version 2016.08.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index c4743df6d..7803ec1e9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.07 Core + Add support for TV Parental Guidelines ratings in parse_age_limit diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2c8f950df..3608e1807 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -182,6 +182,7 @@ - **DigitallySpeaking** - **Digiteka** - **Discovery** + - **DiscoveryGo** - **Dotsub** - **DouyuTV**: 斗鱼 - **DPlay** @@ -518,6 +519,7 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **Pokemon** - **PolskieRadio** - **PornHd** - **PornHub**: PornHub and Thumbzilla diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a4654fa59..b48552031 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.06' +__version__ = '2016.08.07' From b2bd968f4b6795ffe3fc2d923cc73171687c2980 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 7 Aug 2016 22:59:34 +0800 Subject: [PATCH 59/87] [kuwo:singer] Fix extraction --- ChangeLog | 5 +++++ youtube_dl/extractor/kuwo.py 
| 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7803ec1e9..32a96432b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version + +Extractors +* [kuwo:singer] Fix extraction + version 2016.08.07 Core diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index b1d460599..0eeb9ffeb 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( get_element_by_id, clean_html, @@ -242,8 +243,9 @@ class KuwoSingerIE(InfoExtractor): query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) return [ - self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r']+class="name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)', + self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + for song_url in re.findall( + r']+class="name">]+href="(/yinyue/\d+)', webpage) ] From d21a661bb4cbdb92f4db588e3801bb19d4fa96b2 Mon Sep 17 00:00:00 2001 From: Charlie Le Date: Sun, 7 Aug 2016 13:38:10 -0700 Subject: [PATCH 60/87] [README.md] Update Options Link The link references a bad anchor. The updated link now references the correct anchor. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9f3001a6..b42d5c730 100644 --- a/README.md +++ b/README.md @@ -1196,7 +1196,7 @@ Make sure that someone has not already opened the issue you're trying to open. S ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. 
+Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? From e1f93a0a76f054f537c9103a8958e1c1ab631fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Zvoni=CC=81c=CC=8Cek?= Date: Sun, 7 Aug 2016 17:35:54 +0200 Subject: [PATCH 61/87] [rozhlas] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rozhlas.py | 34 ++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/rozhlas.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c2c4617ee..7ede6afda 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -696,6 +696,7 @@ from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py new file mode 100644 index 000000000..ef99b5b1f --- /dev/null +++ b/youtube_dl/extractor/rozhlas.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P[0-9]+)' + _TEST = { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou 
příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') + description = self._html_search_regex(r'

', webpage, 'description', fatal=False) + + url = 'http://media.rozhlas.cz/_audio/' + audio_id + '.mp3' + + return { + 'id': audio_id, + 'url': url, + 'title': title, + 'description': description, + } From de02d1f4e94a50eaad0f268fffec6880e9d61d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 04:58:02 +0700 Subject: [PATCH 62/87] [rozhlas] Fix regexes and improve extraction (Closes #10253) --- youtube_dl/extractor/rozhlas.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py index ef99b5b1f..f8eda8dea 100644 --- a/youtube_dl/extractor/rozhlas.py +++ b/youtube_dl/extractor/rozhlas.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) class RozhlasIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://prehravac.rozhlas.cz/audio/3421320', 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', 'info_dict': { @@ -15,20 +19,32 @@ class RozhlasIE(InfoExtractor): 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' } - } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'skip_download': True, + }] def _real_extract(self, url): audio_id = self._match_id(url) - webpage = self._download_webpage(url, audio_id) - title = self._html_search_regex(r'

(.+?)

', webpage, 'title') - description = self._html_search_regex(r'

', webpage, 'description', fatal=False) + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) - url = 'http://media.rozhlas.cz/_audio/' + audio_id + '.mp3' + title = self._html_search_regex( + r'

(.+?)

\s*]*>.*?

\s*]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r']+title=(["\'])(?P(?:(?!\1).)+)\1[^>]*>.*?

\s*]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) return { 'id': audio_id, - 'url': url, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, 'title': title, 'description': description, + 'duration': duration, + 'vcodec': 'none', } From 6bb801cfafb08efbd5d2d371e3ff2b7336ca09fa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 7 Aug 2016 22:56:04 +0100 Subject: [PATCH 63/87] [cwtv] extract http formats --- youtube_dl/extractor/cwtv.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index c66c359cf..1ab9333b2 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -28,7 +28,8 @@ class CWTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'redirect to http://cwtv.com/shows/arrow/', }, { 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088', 'info_dict': { @@ -44,10 +45,6 @@ class CWTVIE(InfoExtractor): 'upload_date': '20151006', 'timestamp': 1444107300, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', 'only_matching': True, @@ -61,11 +58,30 @@ class CWTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id) - - formats = self._extract_m3u8_formats( - video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + video_data = None + formats = [] + for partner in (154, 213): + vdata = self._download_json( + 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), 
video_id, fatal=False) + if not vdata: + continue + video_data = vdata + for quality, quality_data in vdata.get('videos', {}).items(): + quality_url = quality_data.get('uri') + if not quality_url: + continue + if quality == 'variantplaylist': + formats.extend(self._extract_m3u8_formats( + quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + tbr = int_or_none(quality_data.get('bitrate')) + format_id = 'http' + ('-%d' % tbr if tbr else '') + if self._is_valid_url(quality_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': quality_url, + 'tbr': tbr, + }) self._sort_formats(formats) thumbnails = [{ From f17d5f6d1446c17206df9cc277ea2666aed09fab Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 7 Aug 2016 23:57:19 +0800 Subject: [PATCH 64/87] [features.aol.com] Fix _TESTS --- youtube_dl/extractor/aol.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 42c21bf41..2cdee3320 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -123,6 +123,10 @@ class AolFeaturesIE(InfoExtractor): 'title': 'What To Watch - February 17, 2016', }, 'add_ie': ['FiveMin'], + 'params': { + # encrypted m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): From 412abb8760c493fe2e8eeedab58aa7ca667c336d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 8 Aug 2016 12:57:17 +0800 Subject: [PATCH 65/87] [bilibili] Update _TESTS --- youtube_dl/extractor/bilibili.py | 72 +++++++------------------------- 1 file changed, 16 insertions(+), 56 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index b17047b39..d8eb71821 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -25,13 +25,13 @@ class BiliBiliIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', + 'md5': 
'9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { 'id': '1554319', - 'ext': 'flv', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, + 'duration': 308.315, 'timestamp': 1398012660, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', @@ -41,73 +41,33 @@ class BiliBiliIE(InfoExtractor): }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { - 'id': '1041170', + 'id': '1507019', + 'ext': 'mp4', 'title': '【BD1080P】刀语【诸神&异域】', 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'timestamp': 1396530060, + 'upload_date': '20140403', + 'uploader': '枫叶逝去', + 'uploader_id': '520116', }, - 'playlist_count': 9, }, { 'url': 'http://www.bilibili.com/video/av4808130/', 'info_dict': { - 'id': '4808130', + 'id': '7802182', + 'ext': 'mp4', 'title': '【长篇】哆啦A梦443【钉铛】', 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', }, - 'playlist': [{ - 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c', - 'info_dict': { - 'id': '4808130_part1', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '926f9f67d0c482091872fbd8eca7ea3d', - 'info_dict': { - 'id': '4808130_part2', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '4b7b225b968402d7c32348c646f1fd83', - 'info_dict': { - 'id': '4808130_part3', - 'ext': 'flv', - 'title': 
'【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '7b795e214166501e9141139eea236e91', - 'info_dict': { - 'id': '4808130_part4', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }], }, { # Missing upload time 'url': 'http://www.bilibili.com/video/av1867637/', 'info_dict': { 'id': '2880301', - 'ext': 'flv', + 'ext': 'mp4', 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', 'uploader': '黑夜为猫', From b1c6f21c741b7f18b2b46165be0a3539735a6184 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 8 Aug 2016 12:59:07 +0800 Subject: [PATCH 66/87] [aparat] Fix extraction --- ChangeLog | 1 + youtube_dl/extractor/aparat.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 32a96432b..657ff3e48 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,7 @@ version Extractors * [kuwo:singer] Fix extraction +* [aparat] Fix extraction version 2016.08.07 diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 63429780e..025e29aa4 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -15,7 +13,7 @@ class AparatIE(InfoExtractor): _TEST = { 'url': 'http://www.aparat.com/v/wP8On', - 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'md5': 
'131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', @@ -31,13 +29,13 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + - video_id + '/vt/frame') + embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id webpage = self._download_webpage(embed_url, video_id) - video_urls = [video_url.replace('\\/', '/') for video_url in re.findall( - r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)] - for i, video_url in enumerate(video_urls): + file_list = self._parse_json(self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) + for i, item in enumerate(file_list[0]): + video_url = item['file'] req = HEADRequest(video_url) res = self._request_webpage( req, video_id, note='Testing video URL %d' % i, errnote=False) From d71207121df27dc251ee15628b0742e8d7db0db7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 8 Aug 2016 12:59:55 +0800 Subject: [PATCH 67/87] [biqle] Skip an invalid test --- youtube_dl/extractor/biqle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index ae4579b33..beaebfd2a 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -24,7 +24,8 @@ class BIQLEIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ребенок в шоке от автоматической мойки', 'uploader': 'Dmitry Kotov', - } + }, + 'skip': ' This video was marked as adult. 
Embedding adult videos on external sites is prohibited.', }] def _real_extract(self, url): From a41a6c5094e757eb51cbd6747d868c2f9450f324 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 8 Aug 2016 13:06:02 +0800 Subject: [PATCH 68/87] [chaturbate] Skip the invalid test --- youtube_dl/extractor/chaturbate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index b2234549e..29a8820d5 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -17,7 +17,8 @@ class ChaturbateIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Room is offline', }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, From 3dc240e8c659112a12e8bd1260c04db9f50f1b61 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 8 Aug 2016 18:48:21 +0800 Subject: [PATCH 69/87] [sohu] Update _TESTS (closes #10260) --- youtube_dl/extractor/sohu.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 72fe66142..48e2ba2dd 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -14,10 +14,10 @@ from ..utils import ExtractorError class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?Pmy\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' 
+ # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', @@ -26,7 +26,6 @@ class SohuIE(InfoExtractor): 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -34,7 +33,6 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -48,7 +46,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -56,7 +53,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -64,7 +60,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', From 395c74615c58f5f1b2e462786d6f4d8bab1a313a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 21:49:27 +0700 Subject: [PATCH 70/87] Revert "[extractor/generic] Make _search_json_ld non fatal" This reverts commit 958849275f1c6072c712e8d611294a762fadc7f0. 
--- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 44c6c354c..e89a03760 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2241,7 +2241,7 @@ class GenericIE(InfoExtractor): # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( - webpage, video_id, fatal=False, expected_type='VideoObject') + webpage, video_id, default=None, expected_type='VideoObject') if json_ld and json_ld.get('url'): info_dict.update({ 'title': video_title or info_dict['title'], From 3711fa1eb24224a5b898ede355136c9009355a87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 21:49:45 +0700 Subject: [PATCH 71/87] Revert "[flipagram] Make _search_json_ld non fatal" This reverts commit d34995a9e3f596c4dd80178d99f7bd8dbc748e2b. --- youtube_dl/extractor/flipagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index 634f5fe3b..acb6133ff 100644 --- a/youtube_dl/extractor/flipagram.py +++ b/youtube_dl/extractor/flipagram.py @@ -48,7 +48,7 @@ class FlipagramIE(InfoExtractor): flipagram = video_data['flipagram'] video = flipagram['video'] - json_ld = self._search_json_ld(webpage, video_id, fatal=False) + json_ld = self._search_json_ld(webpage, video_id, default=False) title = json_ld.get('title') or flipagram['captionText'] description = json_ld.get('description') or flipagram.get('captionText') From 321b5e082a7340225091ea5ad852e12b2dc499e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 22:36:18 +0700 Subject: [PATCH 72/87] [extractor/common] Respect default in _search_json_ld --- youtube_dl/extractor/common.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 
0891309dd..e47770c1d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -816,11 +816,14 @@ class InfoExtractor(object): json_ld = self._search_regex( r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) if not json_ld: - return {} - return self._json_ld( - json_ld, video_id, fatal=kwargs.get('fatal', True), - expected_type=expected_type) + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. + fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): From 522f6c066da93b6baec3399b7098556e5ec55f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 22:44:36 +0700 Subject: [PATCH 73/87] [bbc] Add proper default to _search_json_ld call --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b29e05970..83e6d024c 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -759,7 +759,7 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') playlist_title = json_ld_info.get('title') From 1e7f602e2ac27bac266cde4c67b31f59510a91f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 22:45:49 +0700 Subject: [PATCH 74/87] [condenast] Make _search_json_ld 
call non fatal --- youtube_dl/extractor/condenast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 15fabbb1c..8d8f60598 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -143,7 +143,8 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id) if url_type != 'embed' else {} + info = self._search_json_ld( + webpage, video_id, fatal=False) if url_type != 'embed' else {} info.update({ 'id': video_id, 'formats': formats, From e8ed7354e6fdd7bd3759e24f0aa08233316303b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 22:46:19 +0700 Subject: [PATCH 75/87] [flipagram] Add proper default to _search_json_ld call --- youtube_dl/extractor/flipagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index acb6133ff..1902a2393 100644 --- a/youtube_dl/extractor/flipagram.py +++ b/youtube_dl/extractor/flipagram.py @@ -48,7 +48,7 @@ class FlipagramIE(InfoExtractor): flipagram = video_data['flipagram'] video = flipagram['video'] - json_ld = self._search_json_ld(webpage, video_id, default=False) + json_ld = self._search_json_ld(webpage, video_id, default={}) title = json_ld.get('title') or flipagram['captionText'] description = json_ld.get('description') or flipagram.get('captionText') From 082395d0a0f21375f63e3c9b33011b30d5de3aae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Aug 2016 22:48:33 +0700 Subject: [PATCH 76/87] [extractor/generic] Add proper default to _search_json_ld call --- youtube_dl/extractor/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e89a03760..50500ce0e 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -2241,8 +2241,8 @@ class GenericIE(InfoExtractor): # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( - webpage, video_id, default=None, expected_type='VideoObject') - if json_ld and json_ld.get('url'): + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): info_dict.update({ 'title': video_title or info_dict['title'], 'description': video_description, From 8991844ea260a17943e935acfd5565caab27a99e Mon Sep 17 00:00:00 2001 From: Sukhbir Singh Date: Sun, 7 Aug 2016 19:11:00 -0400 Subject: [PATCH 77/87] [sonyliv] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sonyliv.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 youtube_dl/extractor/sonyliv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7ede6afda..f1043dae6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -756,6 +756,7 @@ from .smotri import ( ) from .snotr import SnotrIE from .sohu import SohuIE +from .sonyliv import SonyLIVIE from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py new file mode 100644 index 000000000..c717b1f4d --- /dev/null +++ b/youtube_dl/extractor/sonyliv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?sonyliv\.com/details/episodes/(?P[0-9]+)/' + _TEST = { + 'url': 'http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor\'s-Delight', + 'info_dict': { + 'title': 'Ep. 
1 - Achaari Cheese Toast - Bachelor\'s Delight', + 'id': '5024612095001', + 'ext': 'mp4', + 'upload_date': '20160707', + 'description': 'Bachelor\'s Delight is a new food show from Sony LIV to satisfy the taste buds of all those bachelors looking for a quick bite.', + 'uploader_id': '4338955589001', + 'timestamp': 1467870968, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % self._match_id(url), 'BrightcoveNew') From 77426a087b6af712b69faf62a54db959f0d13288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 02:16:28 +0700 Subject: [PATCH 78/87] [sonyliv] Improve (Closes #10258) --- youtube_dl/extractor/sonyliv.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py index c717b1f4d..accd112aa 100644 --- a/youtube_dl/extractor/sonyliv.py +++ b/youtube_dl/extractor/sonyliv.py @@ -5,15 +5,15 @@ from .common import InfoExtractor class SonyLIVIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?sonyliv\.com/details/episodes/(?P[0-9]+)/' - _TEST = { - 'url': 'http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor\'s-Delight', + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", 'info_dict': { - 'title': 'Ep. 1 - Achaari Cheese Toast - Bachelor\'s Delight', + 'title': "Ep. 
1 - Achaari Cheese Toast - Bachelor's Delight", 'id': '5024612095001', 'ext': 'mp4', 'upload_date': '20160707', - 'description': 'Bachelor\'s Delight is a new food show from Sony LIV to satisfy the taste buds of all those bachelors looking for a quick bite.', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', 'uploader_id': '4338955589001', 'timestamp': 1467870968, }, @@ -21,8 +21,14 @@ class SonyLIVIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], - } + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' def _real_extract(self, url): - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % self._match_id(url), 'BrightcoveNew') + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From affaea0688780c03afe36bc81c7151e696649b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Zvoni=CC=81c=CC=8Cek?= Date: Sat, 6 Aug 2016 15:26:48 +0200 Subject: [PATCH 79/87] [rbmaradio] Fixed extractor --- youtube_dl/extractor/rbmaradio.py | 45 +++++++++++++------------------ 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 7932af6ef..92f0d8f76 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,55 +1,46 @@ # encoding: utf-8 from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import ( - ExtractorError, + clean_html ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/[^/]+/episodes/(?P[^/]+)$' _TEST = { - 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 
'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'uploader_id': 'ford-lopatin', - 'location': 'Spain', 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.', - 'uploader': 'Ford & Lopatin', - 'title': 'Live at Primavera Sound 2011', + 'title': 'Ford & Lopatin - Main Stage', }, } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + json_data = self._search_regex(r'', + webpage, 'json data') + data = self._parse_json(json_data, video_id) - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, 'json data', flags=re.MULTILINE) + item = None + for episode in data['episodes']: + items = data['episodes'][episode] + if video_id in items: + item = items[video_id] - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError('Invalid JSON: ' + str(e)) - - video_url = data['akamai_url'] + '&cbr=256' + video_url = item['audioURL'] + '?cbr=256' return { 'id': video_id, 'url': video_url, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + 'title': item.get('title') + ' - ' + item.get('showTitle'), + 'description': clean_html(item.get('longTeaser')), + 'thumbnail': self._proto_relative_url(item.get('imageURL', {}).get('landscape')), + 'duration': item.get('duration'), } From 3a380766d1d6abd83213319b41cdf9a18977a69c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 
9 Aug 2016 02:46:29 +0700 Subject: [PATCH 80/87] [rbmaradio] Improve, simplify and extract all formats (Closes #10242) --- youtube_dl/extractor/rbmaradio.py | 69 +++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 92f0d8f76..471928ef8 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,46 +1,71 @@ -# encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - clean_html + clean_html, + int_or_none, + unified_timestamp, + update_url_query, ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/[^/]+/episodes/(?P[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)/episodes/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. 
Live at Primavera Sound 2011.', - 'title': 'Ford & Lopatin - Main Stage', + 'title': 'Main Stage - Ford & Lopatin', + 'description': 'md5:4f340fb48426423530af5a9d87bd7b91', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', }, } def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - json_data = self._search_regex(r'', - webpage, 'json data') - data = self._parse_json(json_data, video_id) + webpage = self._download_webpage(url, episode_id) - item = None - for episode in data['episodes']: - items = data['episodes'][episode] - if video_id in items: - item = items[video_id] + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] - video_url = item['audioURL'] + '?cbr=256' + title = episode['title'] + + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 256)] + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) return { - 'id': video_id, - 'url': video_url, - 'title': item.get('title') + ' - ' + item.get('showTitle'), - 'description': clean_html(item.get('longTeaser')), - 'thumbnail': self._proto_relative_url(item.get('imageURL', {}).get('landscape')), - 'duration': item.get('duration'), + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': 
timestamp, + 'formats': formats, } From 065bc354894f1d35592529455d9eb685470124b9 Mon Sep 17 00:00:00 2001 From: singh-pratyush96 Date: Thu, 4 Aug 2016 15:47:22 +0530 Subject: [PATCH 81/87] Add --max-sleep-interval (Closes #9930) --- youtube_dl/YoutubeDL.py | 6 +++++- youtube_dl/__init__.py | 7 +++++++ youtube_dl/downloader/common.py | 7 +++++-- youtube_dl/options.py | 12 ++++++++++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6551f086f..82b77783d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -249,7 +249,11 @@ class YoutubeDL(object): source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download. + sleep_interval: Minimum number of seconds to sleep before each download. + Sleep will be for a random interval if --max-sleep-interval is also passed. + max_sleep_interval:Max number of seconds to sleep before each download. + Sleep will be for a random interval if passed along with --min-sleep-interval + or --sleep-interval, otherwise ignored. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. 
match_filter: A function that gets called with the info_dict of diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2b34bf9c2..86af18d33 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -145,6 +145,12 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval should not be negative') + elif opts.max_sleep_interval is not None: + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval should not be less than sleep interval') def parse_retries(retries): if retries in ('inf', 'infinite'): @@ -370,6 +376,7 @@ def _real_main(argv=None): 'source_address': opts.source_address, 'call_home': opts.call_home, 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1dba9f49a..8e377c72c 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,6 +4,7 @@ import os import re import sys import time +import random from ..compat import compat_os_name from ..utils import ( @@ -342,8 +343,10 @@ class FileDownloader(object): }) return True - sleep_interval = self.params.get('sleep_interval') - if sleep_interval: + sleep_lower_bound = self.params.get('sleep_interval') + if sleep_lower_bound: + sleep_upper_bound = self.params.get('max_sleep_interval', sleep_lower_bound) + sleep_interval = random.uniform(sleep_lower_bound, sleep_upper_bound) self.to_screen('[download] Sleeping %s seconds...' 
% sleep_interval) time.sleep(sleep_interval) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 942d44912..068e824a0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -499,9 +499,17 @@ def parseOpts(overrideArguments=None): dest='bidi_workaround', action='store_true', help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') workarounds.add_option( - '--sleep-interval', metavar='SECONDS', + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Number of seconds to sleep before each download.') + help='Minimum number of seconds to sleep before each download. Sleep will be for a random interval if ' + '--max-sleep-interval is also passed.' + ) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help='Max number of seconds to sleep before each download. Sleep will be for a random interval if passed' + ' along with --min-sleep-interval or --sleep-interval, otherwise ignored.' + ) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( From 7aa589a5e10c983f701b48b8431cc5f54b85cf3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 03:46:52 +0700 Subject: [PATCH 82/87] Fix --min/max-sleep-interval wording --- youtube_dl/YoutubeDL.py | 15 ++++++++++----- youtube_dl/options.py | 15 +++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 82b77783d..193f8db9f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -249,11 +249,16 @@ class YoutubeDL(object): source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Minimum number of seconds to sleep before each download. 
- Sleep will be for a random interval if --max-sleep-interval is also passed. - max_sleep_interval:Max number of seconds to sleep before each download. - Sleep will be for a random interval if passed along with --min-sleep-interval - or --sleep-interval, otherwise ignored. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 068e824a0..d32a9e32c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -501,15 +501,18 @@ def parseOpts(overrideArguments=None): workarounds.add_option( '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Minimum number of seconds to sleep before each download. Sleep will be for a random interval if ' - '--max-sleep-interval is also passed.' - ) + help=( + 'Number of seconds to sleep before each download when used alone ' + 'or a lower bound of a range for randomized sleep before each download ' + '(minimum possible number of seconds to sleep) when used along with ' + '--max-sleep-interval.')) workarounds.add_option( '--max-sleep-interval', metavar='SECONDS', dest='max_sleep_interval', type=float, - help='Max number of seconds to sleep before each download. 
Sleep will be for a random interval if passed' ' along with --min-sleep-interval or --sleep-interval, otherwise ignored.' ) + help=( + 'Upper bound of a range for randomized sleep before each download ' + '(maximum possible number of seconds to sleep). Must only be used ' + 'along with --min-sleep-interval.')) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( From 1ad6b891b21b45830736698a7b59c30d9605a562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 03:47:56 +0700 Subject: [PATCH 83/87] Add more checks for --min/max-sleep-interval arguments and use more idiomatic naming --- youtube_dl/__init__.py | 12 ++++++++---- youtube_dl/downloader/common.py | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 86af18d33..a9730292c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -147,10 +147,14 @@ def _real_main(argv=None): opts.max_filesize = numeric_limit if opts.sleep_interval is not None: if opts.sleep_interval < 0: - parser.error('sleep interval should not be negative') - elif opts.max_sleep_interval is not None: - if opts.max_sleep_interval < opts.sleep_interval: - parser.error('max sleep interval should not be less than sleep interval') + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval def parse_retries(retries): if retries in ('inf', 'infinite'): diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 8e377c72c..8482cbd84 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -343,10 +343,10 
class FileDownloader(object): }) return True - sleep_lower_bound = self.params.get('sleep_interval') - if sleep_lower_bound: - sleep_upper_bound = self.params.get('max_sleep_interval', sleep_lower_bound) - sleep_interval = random.uniform(sleep_lower_bound, sleep_upper_bound) + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) self.to_screen('[download] Sleeping %s seconds...' % sleep_interval) time.sleep(sleep_interval) From 5e42f8a0adec94156b911399558a967214a9d3aa Mon Sep 17 00:00:00 2001 From: nyorain Date: Sat, 6 Aug 2016 01:21:39 +0200 Subject: [PATCH 84/87] Make --metadata-from-title non fatal Output a warning if the metadata can't be parsed from the title (and don't write any metadata) instead of raising a critical error. --- youtube_dl/postprocessor/metadatafromtitle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index 42377fa0f..f7c831c67 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -38,7 +38,8 @@ class MetadataFromTitlePP(PostProcessor): title = info['title'] match = re.match(self._titleregex, title) if match is None: - raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat) + return [], info for attribute, value in match.groupdict().items(): value = match.group(attribute) info[attribute] = value From 25dd58ca6a1326c27e57262c3a298b163fc3c1eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 04:01:05 +0700 Subject: [PATCH 85/87] [metadatafromtitle] Remove unused
exception class --- youtube_dl/postprocessor/metadatafromtitle.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index f7c831c67..920573da9 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -3,11 +3,6 @@ from __future__ import unicode_literals import re from .common import PostProcessor -from ..utils import PostProcessingError - - -class MetadataFromTitlePPError(PostProcessingError): - pass class MetadataFromTitlePP(PostProcessor): From 9778b3e7eede2d405f0102a958ea7df7ab2c6444 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 04:03:52 +0700 Subject: [PATCH 86/87] Credit @zvonicek for #10242 and #10253 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 890c827a0..7e17d625b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -179,3 +179,4 @@ Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel Rob van Bekkum +Petr Zvoníček From b6578166847c5435095346b0225abd758cad6bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 9 Aug 2016 04:04:45 +0700 Subject: [PATCH 87/87] Credit @singh-pratyush96 for #10223 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 7e17d625b..1fd4be785 100644 --- a/AUTHORS +++ b/AUTHORS @@ -180,3 +180,4 @@ Aleksandar Topuzović Nehal Patel Rob van Bekkum Petr Zvoníček +Pratyush Singh