diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c2bd5d8ae..b2bfa9ec5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.01 +[debug] youtube-dl version 2018.05.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 916b8edb8..ef6cc3850 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +version 2018.05.09 + +Core +* [YoutubeDL] Ensure ext exists for automatic captions +* Introduce --geo-bypass-ip-block + +Extractors ++ [udemy] Extract asset captions ++ [udemy] Extract stream URLs (#16372) ++ [businessinsider] Add support for businessinsider.com (#16387, #16388, #16389) ++ [cloudflarestream] Add support for cloudflarestream.com (#16375) +* [watchbox] Fix extraction (#16356) +* [discovery] Extract Affiliate/Anonymous Auth Token from cookies (#14954) ++ [itv:btcc] Add support for itv.com/btcc (#16139) +* [tunein] Use live title for live streams (#16347) +* [itv] Improve extraction (#16253) + + version 2018.05.01 Core diff --git a/README.md b/README.md index 5af0f387b..d9fe2350a 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental) + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with + explicitly provided IP block in CIDR + notation (experimental) ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c5a48002b..88fac6e90 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,6 +122,7 @@ - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen + - **BusinessInsider** - **BuzzFeed** - **BYUtv** - **Camdemy** @@ -163,6 +164,7 @@ - **ClipRs** - **Clipsyndicate** - **CloserToTruth** + - **CloudflareStream** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -373,6 +375,7 @@ - **Ir90Tv** - **ITTF** - **ITV** + - **ITVBTCC** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f1a359011..046e03247 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1482,23 +1482,28 @@ class YoutubeDL(object): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + for cc_kind in ('subtitles', 'automatic_captions'): + cc = info_dict.get(cc_kind) + if cc: + for _, subtitle in cc.items(): + for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('ext') is None: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + + automatic_captions = info_dict.get('automatic_captions') subtitles = info_dict.get('subtitles') - if subtitles: - for _, subtitle in subtitles.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: - self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles( + info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return + info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, - info_dict.get('automatic_captions')) + info_dict['id'], subtitles, automatic_captions) # We now pick which formats have to be downloaded if info_dict.get('formats') is None: diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a56b7690f..b7bccb504 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -179,6 +179,10 @@ class MixcloudIE(InfoExtractor): formats.append({ 'format_id': 'http', 'url': decrypted, + 'downloader_options': { + # Mixcloud starts throttling at >~5M + 'http_chunk_size': 5242880, + }, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 256a24d86..5e34d776b 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,7 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.nl + (?:www\.)?nickjr\.[a-z]{2} ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) ''' @@ -98,6 +98,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/', 'only_matching': True, + }, { + 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index 8372925be..7b0aa6232 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -47,7 +47,7 @@ class RedditIE(InfoExtractor): class RedditRIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -86,6 +86,10 @@ class RedditRIE(InfoExtractor): # youtube 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', 'only_matching': True, + }, { + # reddit video @ nm reddit + 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 9056c8cbc..f06e5b19a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,35 +1,34 @@ # coding: utf-8 from __future__ import unicode_literals -import binascii -import re import json from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, -) from ..utils import ( - ExtractorError, - qualities, determine_ext, + ExtractorError, + int_or_none, + mimetype2ext, + parse_duration, + parse_iso8601, + qualities, ) class TeamcocoIE(InfoExtractor): - _VALID_URL = r'https?://teamcoco\.com/video/(?P[0-9]+)?/?(?P.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P[^/?#]+)' _TESTS = [ { - 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', - 'md5': '3f7746aa0dc86de18df7539903d399ea', + 'url': 'http://teamcoco.com/video/mary-kay-remote', + 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 504, - 'age_limit': 0, + 'duration': 495.0, + 'upload_date': '20140402', + 'timestamp': 1396407600, } }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', @@ -40,7 +39,8 @@ class TeamcocoIE(InfoExtractor): 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'duration': 288, - 'age_limit': 0, + 'upload_date': '20111104', + 'timestamp': 1320405840, } }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', @@ -49,6 +49,8 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', + 'upload_date': '20150415', + 'timestamp': 1429088400, }, 'params': { 'skip_download': True, # m3u8 downloads @@ -63,110 +65,93 @@ class TeamcocoIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, + 'skip': 'This video is no longer available.', } ] - _VIDEO_ID_REGEXES = ( - r'"eVar42"\s*:\s*(\d+)', - r'Ginger\.TeamCoco\.openInApp\("video",\s*"([^"]+)"', - r'"id_not"\s*:\s*(\d+)' - ) + + def _graphql_call(self, query_template, object_type, object_id): + find_object = 'find' + object_type + return self._download_json( + 'http://teamcoco.com/graphql/', object_id, data=json.dumps({ + 'query': query_template % (find_object, object_id) + }))['data'][find_object] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + display_id = self._match_id(url) - display_id = mobj.group('display_id') - webpage, urlh = self._download_webpage_handle(url, display_id) - if 'src=expired' in urlh.geturl(): - raise ExtractorError('This video is expired.', expected=True) + response = self._graphql_call('''{ + %s(slug: "video/%s") { + ... on RecordSlug { + record { + id + title + teaser + publishOn + thumb { + preview + } + tags { + name + } + duration + } + } + ... on NotFoundSlug { + status + } + } +}''', 'Slug', display_id) + if response.get('status'): + raise ExtractorError('This video is no longer available.', expected=True) - video_id = mobj.group('video_id') - if not video_id: - video_id = self._html_search_regex( - self._VIDEO_ID_REGEXES, webpage, 'video id') + record = response['record'] + video_id = record['id'] - data = None - - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - - def _check_sequence(cur_fragments): - if not cur_fragments: - return - for i in range(len(cur_fragments)): - cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') - try: - raw_data = compat_b64decode(cur_sequence) - if compat_ord(raw_data[0]) == compat_ord('{'): - return json.loads(raw_data.decode('utf-8')) - except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): - continue - - def _check_data(): - for i in range(len(base64_fragments) + 1): - for j in range(i, len(base64_fragments) + 1): - data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) - if data: - return data - - self.to_screen('Try to compute possible data sequence. This may take some time.') - data = _check_data() - - if not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) + srcs = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id)['src'] formats = [] - get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) - for filed in data['files']: - if determine_ext(filed['url']) == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if filed['url'].startswith('/'): - m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] - else: - m3u8_url = filed['url'] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4') - for m3u8_format in m3u8_formats: - if m3u8_format not in formats: - formats.append(m3u8_format) - elif determine_ext(filed['url']) == 'f4m': - # TODO Correct f4m extraction + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in srcs.items(): + if not isinstance(src, dict): continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) else: - if filed['url'].startswith('/mp4:protected/'): + if src_url.startswith('/mp4:protected/'): # TODO Correct extraction for these files continue - m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) - if m_format is not None: - format_id = m_format.group(1) - else: - format_id = filed['bitrate'] - tbr = ( - int(filed['bitrate']) - if filed['bitrate'].isdigit() - else None) + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) formats.append({ - 'url': filed['url'], - 'ext': 'mp4', + 'url': src_url, + 'ext': ext, 'tbr': tbr, 'format_id': format_id, 'quality': get_quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': data['title'], - 'thumbnail': data.get('thumb', {}).get('href'), - 'description': data.get('teaser'), - 'duration': data.get('duration'), - 'age_limit': self._family_friendly_search(webpage), + 'title': record['title'], + 'thumbnail': record.get('thumb', {}).get('preview'), + 'description': record.get('teaser'), + 'duration': parse_duration(record.get('duration')), + 'timestamp': parse_iso8601(record.get('publishOn')), } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4c11fd3c3..3ee2af52e 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -8,6 +8,7 @@ import random from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -16,11 +17,14 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + float_or_none, int_or_none, - js_to_json, orderedSet, parse_duration, parse_iso8601, + qualities, + try_get, + unified_timestamp, update_url_query, urlencode_postdata, urljoin, @@ -45,10 +49,11 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _call_api(self, path, item_id, note): + def _call_api(self, path, item_id, *args, **kwargs): + kwargs.setdefault('headers', {})['Client-ID'] = self._CLIENT_ID response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, note, - headers={'Client-ID': self._CLIENT_ID}) + '%s/%s' % (self._API_BASE, path), item_id, + *args, **compat_kwargs(kwargs)) self._handle_error(response) return response @@ -622,21 +627,23 @@ class TwitchStreamIE(TwitchBaseIE): } -class TwitchClipsIE(InfoExtractor): +class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', + 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': 'AggressiveCobraPoooound', + 'id': '42850523', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1465767393, + 'upload_date': '20160612', 'creator': 'EA', 'uploader': 'stereotype_', - 'uploader_id': 'stereotype_', + 'uploader_id': '43566419', }, }, { # multiple formats @@ -647,34 +654,63 @@ class TwitchClipsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + status = self._download_json( + 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id, + video_id) - clip = self._parse_json( - self._search_regex( - r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), - video_id, transform_source=js_to_json) + formats = [] - title = clip.get('title') or clip.get('channel_title') or self._og_search_title(webpage) - - formats = [{ - 'url': option['source'], - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - } for option in clip.get('quality_options', []) if option.get('source')] - - if not formats: - formats = [{ - 'url': clip['clip_video_url'], - }] + for option in status['quality_options']: + if not isinstance(option, dict): + continue + source = option.get('source') + if not source or not isinstance(source, compat_str): + continue + formats.append({ + 'url': source, + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + 'fps': int_or_none(option.get('frame_rate')), + }) self._sort_formats(formats) - return { - 'id': video_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), - 'uploader': clip.get('curator_login'), - 'uploader_id': clip.get('curator_display_name'), + info = { 'formats': formats, } + + clip = self._call_api( + 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={ + 'Accept': 'application/vnd.twitchtv.v5+json', + }) + + if clip: + quality_key = qualities(('tiny', 'small', 'medium')) + thumbnails = [] + thumbnails_dict = clip.get('thumbnails') + if isinstance(thumbnails_dict, dict): + for thumbnail_id, thumbnail_url in thumbnails_dict.items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'preference': quality_key(thumbnail_id), + }) + + info.update({ + 'id': clip.get('tracking_id') or video_id, + 'title': clip.get('title') or video_id, + 'duration': float_or_none(clip.get('duration')), + 'views': int_or_none(clip.get('views')), + 'timestamp': unified_timestamp(clip.get('created_at')), + 'thumbnails': thumbnails, + 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str), + 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str), + 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), + }) + else: + info.update({ + 'title': video_id, + 'id': video_id, + }) + + return info diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bf1134e3f..0a74a9768 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -18,6 +18,7 @@ from ..utils import ( int_or_none, js_to_json, sanitized_Request, + try_get, unescapeHTML, urlencode_postdata, ) @@ -105,7 +106,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', }) def _handle_error(self, response): @@ -303,9 +304,25 @@ class UdemyIE(InfoExtractor): 'url': src, }) - download_urls = asset.get('download_urls') - if isinstance(download_urls, dict): - extract_formats(download_urls.get('Video')) + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) + + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = cc.get('url') + if not cc_url or not isinstance(cc_url, compat_str): + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) view_html = lecture.get('view_html') if view_html: diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 04896efc8..6f47b1795 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.01' +__version__ = '2018.05.09'