From e298d3a08c2c04aceb5b32e6e8f59c7832d65bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jul 2016 21:05:39 +0700 Subject: [PATCH 01/55] [youtube] Fix authentication (Closes #10140) --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 270ee8861..268080ba6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -116,12 +117,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = urlencode_postdata(login_form_strs) - - req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( - req, None, - note='Logging in', errnote='unable to log in', fatal=False) + self._PASSWORD_CHALLENGE_URL, None, + note='Logging in', errnote='unable to log in', fatal=False, + data=urlencode_postdata(login_form_strs)) if login_results is False: return False From add7d2a0e2127f412883b726392a9c3ff6e8be9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jul 2016 21:24:09 +0700 Subject: [PATCH 02/55] [pornhub] Make error regex less ambiguous (Closes #10138) --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index d2c92531b..20976c101 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -111,7 +111,7 @@ class PornHubIE(InfoExtractor): webpage = 
self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>', + r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) From b13647cf3c9ab2fb6afafdd761e04a993fe68bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jul 2016 23:04:13 +0700 Subject: [PATCH 03/55] [eporner] Fix extraction (Closes #10139) --- youtube_dl/extractor/eporner.py | 82 +++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index ac5d0fe24..f3734e9f8 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,19 +4,23 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + encode_base_n, + ExtractorError, + int_or_none, parse_duration, str_to_int, ) class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' 
_TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { - 'id': '95008', + 'id': 'qlDUmNsj6VS', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', @@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor): # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.*?) - EPORNER', webpage, 'title') + webpage, urlh = self._download_webpage_handle(url, display_id) - redirect_url = 'http://www.eporner.com/config5/%s' % video_id - player_code = self._download_webpage( - redirect_url, display_id, note='Downloading player config') + video_id = self._match_id(compat_str(urlh.geturl())) - sources = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources') + hash = self._search_regex( + r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) 
- EPORNER', webpage, 'title') + + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] formats = [] - for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources): - fmt = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = format_dict.get('src') + if not isinstance(src, compat_str) or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) self._sort_formats(formats) duration = parse_duration(self._html_search_meta('duration', webpage)) From 5275efe30d0606deb44c7723271110f66a1e1e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jul 2016 23:11:28 +0700 Subject: [PATCH 04/55] release 2016.07.22 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 2 ++ 
youtube_dl/version.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 8b68f371b..f5b444b84 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.17 +[debug] youtube-dl version 2016.07.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index eaa165347..36b8ec9ad 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,6 +142,7 @@ - **ComCarCoff** - **ComedyCentral** - 
**ComedyCentralShows**: The Daily Show / The Colbert Report + - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** - **Cracked** @@ -477,6 +478,7 @@ - **NYTimes** - **NYTimesArticle** - **ocw.mit.edu** + - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 34b62480b..732a5dfdf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.17' +__version__ = '2016.07.22' From f164b97123d5b61a7dd055c888212a0dc670f04f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 16:48:59 +0700 Subject: [PATCH 05/55] [utils] Add another f4m mimetype to mimetype2ext --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e6e0155b4..f5cd6819b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2123,6 +2123,7 @@ def mimetype2ext(mt): 'dash+xml': 'mpd', 'f4m': 'f4m', 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', }.get(res, res) From 4671dd41b27e39eb4682189fca44d0f4272a4751 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum <rob_van_bekkum@hotmail.com> Date: Sat, 27 Feb 2016 18:21:42 +0100 Subject: [PATCH 06/55] [arkena:lcp] Add extractors --- youtube_dl/extractor/arkenaplay.py | 151 +++++++++++++++++++++++++++++ youtube_dl/extractor/lcp.py | 39 ++++++++ 2 files changed, 190 insertions(+) create mode 100644 youtube_dl/extractor/arkenaplay.py create mode 100644 youtube_dl/extractor/lcp.py diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py new file mode 100644 index 000000000..0061ea196 --- /dev/null +++ 
b/youtube_dl/extractor/arkenaplay.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601 +) +import re + + +class ArkenaPlayIE(InfoExtractor): + IE_NAME = 'ArkenaPlay' + _VALID_URL = r'(?P<shortcut>arkena:(?P<version>[0-9]+):(?P<mediatype>[A-Za-z0-9]+):(?P<mediaId>[^:]+):(?P<widgetsettingId>[A-Za-z0-9]+):(?P<accountId>[A-Za-z0-9]+))|(?:(?P<host>https?://(?:www\.)?play\..*\..*)/embed/(?:avp/v[0-9]+/player/[A-Za-z0-9]+/)?(?P<id>.*)?)' + + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'info_dict': { + 'id': '327336', + 'url': 're:http://httpod.scdn.arkena.com/11970/327336.*', + 'ext': 'mp4', + 'title': '327336', + 'upload_date': '20160225', + 'timestamp': 1456391602 + } + }, { + # Shortcut for: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + 'url': 'arkena:2:media:b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe:1:129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'url': 'http://88e04ec095b07cd1aa3ea588be47e870.httpcache0.90034-httpcache0.dna.qbrick.com/90034-httpcache0/4bf759a1-00090034/bbb_sunflower_2160p_60fps_normal_720p.mp4', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'upload_date': '20150528', + 'timestamp': 1432816365 + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj.group('shortcut'): + version = mobj.group('version') + mediatype = mobj.group('mediatype') + mediaid = mobj.group('mediaId') + widgetsettingid = mobj.group('widgetsettingId') + accountid = mobj.group('accountId') + display_id = '{0}:{1}:{2}:{3}'.format(mediatype, mediaid, widgetsettingid, accountid) + media_url = 'https://play.arkena.com/config/avp/v{0}/player/{1}/{2}/{3}/{4}/?callbackMethod=?'.format( + version, 
mediatype, mediaid, widgetsettingid, accountid) + else: + display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') + webpage = self._download_webpage(url, display_id) + + media_url_regex = '"(?P<mediainfo>(?P<host>.*)/(c|C)onfig/.*\?callbackMethod=\?)"' + media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') + hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') + if not hostname: + hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') + media_url = hostname + media_url + + # Extract the required info of the media files gathered in a dictionary + arkena_info = self._download_webpage(media_url, 'arkena_info_') + arkena_info_regex = r'\?\((?P<json>.*)\);' + media_dict = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), + display_id) + + # All videos are part of a playlist, a single video is also put in a playlist + playlist_items = media_dict.get('Playlist', []) + if len(playlist_items) == 0: + return self.url_result(url, 'Generic') + elif len(playlist_items) == 1: + arkena_media_info = playlist_items[0] + return self.__extract_from_playlistentry(arkena_media_info) + else: + entries_info = [] + for arkena_playlist_item in playlist_items: + entries_info.append(self.__extract_from_playlistentry(arkena_playlist_item)) + return { + 'id': display_id, + 'entries': entries_info + } + + def __extract_from_playlistentry(self, arkena_playlistentry_info): + media_info = arkena_playlistentry_info.get('MediaInfo', {}) + thumbnails = self.__get_thumbnails(media_info) + title = media_info.get('Title') + description = media_info.get('Description') + video_id = media_info.get('VideoId') + timestamp = parse_iso8601(media_info.get('PublishDate')) + formats = self.__get_video_formats(arkena_playlistentry_info, video_id) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 
'description': description, + 'timestamp': timestamp + } + + def __get_thumbnails(self, arkena_mediainfo): + thumbnails = [] + thumbnails_info = arkena_mediainfo.get('Poster') + if not thumbnails_info: + return None + for thumbnail in thumbnails_info: + thumbnail_url = thumbnail.get('Url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('Size')) + }) + return thumbnails + + def __get_video_formats(self, media_files_info, video_id): + formats = [] + media_files = media_files_info.get('MediaFiles') + if not media_files: + return None + + for type_name, video_files_json in media_files.iteritems(): + for video_info in video_files_json: + video_url = video_info.get('Url') + if not video_url: + continue + type = video_info.get('Type') + if type_name in ['Mp4', 'WebM', 'Flash']: + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) + ext = None + if type == 'video/mp4': + ext = 'mp4' + elif type == 'video/webm': + ext = 'webm' + elif type == 'video/x-flv': + ext = 'flv' + formats.append({ + 'url': video_url, + 'ext': ext, + 'tbr': bitrate + }) + elif type_name == 'M3u8' and type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif type_name == 'Flash' and type == 'application/hds+xml': + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif type_name == 'Dash' and type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + + self._sort_formats(formats) + return formats \ No newline at end of file diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 index 000000000..38d7502df --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor + +class LcpIE(InfoExtractor): + 
IE_NAME = 'LCP' + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'ab96c4dae94322ece1e98d97c8dc7807', + 'info_dict': { + 'id': 'd56d03e9', + 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_.*', + 'ext': 'mp4', + 'title': 'd56d03e9', + 'upload_date': '20160226', + 'timestamp': 1456488895 + } + }, { + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'title': 'Le direct | LCP Assembl\xe9e nationale', + 'id': 'le-direct', + }, + 'playlist_mincount': 1 + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + embed_url_regex = r'"(?P<url>(?:https?://(?:www\.)?)?play\.lcp\.fr/embed/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+)"' + embed_url = self._html_search_regex(embed_url_regex, webpage, 'player_url', default=None, fatal=False) + if not embed_url: + return self.url_result(url, 'Generic') + + title = self._og_search_title(webpage, default=None) + return self.url_result(embed_url, 'ArkenaPlay', video_id=display_id, video_title=title) From bbe1f3634aaa800bef99b1edba8058f337da98fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 17:55:54 +0700 Subject: [PATCH 07/55] [arkena] Improve extraction (Closes #8682) --- youtube_dl/extractor/arkena.py | 115 +++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 youtube_dl/extractor/arkena.py diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 000000000..d003a027b --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + 
parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: + f_url = f.get('Url') + if not 
f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + pass + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } From c8e170b2092f5e2ad9ea8fd7fb2eedd35e307a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 17:56:11 +0700 Subject: [PATCH 08/55] [lcp] Improve extraction --- youtube_dl/extractor/lcp.py | 81 ++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 38d7502df..ade27a99e 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -1,39 +1,90 @@ # coding: utf-8 from __future__ import 
unicode_literals + from .common import InfoExtractor +from .arkena import ArkenaIE + + +class LcpPlayIE(ArkenaIE): + _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+' + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': '327336', + 'ext': 'mp4', + 'title': '327336', + 'timestamp': 1456391602, + 'upload_date': '20160225', + }, + 'params': { + 'skip_download': True, + }, + }] + class LcpIE(InfoExtractor): - IE_NAME = 'LCP' - _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)' _TESTS = [{ + # arkena embed 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', - 'md5': 'ab96c4dae94322ece1e98d97c8dc7807', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', 'info_dict': { 'id': 'd56d03e9', - 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_.*', 'ext': 'mp4', - 'title': 'd56d03e9', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'description': 'md5:96ad55009548da9dea19f4120c6c16a8', + 'timestamp': 1456488895, 'upload_date': '20160226', - 'timestamp': 1456488895 - } + }, + 'params': { + 'skip_download': True, + }, }, { + # dailymotion live stream 'url': 'http://www.lcp.fr/le-direct', 'info_dict': { - 'title': 'Le direct | LCP Assembl\xe9e nationale', - 'id': 'le-direct', + 'id': 'xji3qy', + 'ext': 'mp4', + 'title': 'La Chaine Parlementaire (LCP), Live TNT', + 'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b', + 'uploader': 'LCP', + 'uploader_id': 'xbz33d', + 'timestamp': 1308923058, + 'upload_date': '20110624', }, - 'playlist_mincount': 1 + 'params': { + # m3u8 live stream + 'skip_download': True, + }, + }, { + 'url': 'http://www.lcp.fr/emissions/277792-les-volontaires', + 'only_matching': True, }] def _real_extract(self, 
url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - embed_url_regex = r'"(?P<url>(?:https?://(?:www\.)?)?play\.lcp\.fr/embed/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+)"' - embed_url = self._html_search_regex(embed_url_regex, webpage, 'player_url', default=None, fatal=False) - if not embed_url: + play_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + webpage, 'play iframe', default=None, group='url') + + if not play_url: return self.url_result(url, 'Generic') - title = self._og_search_title(webpage, default=None) - return self.url_result(embed_url, 'ArkenaPlay', video_id=display_id, video_title=title) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + description = self._html_search_meta( + ('description', 'twitter:description'), webpage) + + return { + '_type': 'url_transparent', + 'ie_key': LcpPlayIE.ie_key(), + 'url': play_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } From 0673741af33557dbcfe18f06fbc9c0145e64822c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 17:56:29 +0700 Subject: [PATCH 09/55] [extractors] Add imports for arkena and lcp --- youtube_dl/extractor/extractors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6de50296c..3ca0ef83a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arkena import ArkenaIE from .ard import ( ARDIE, ARDMediathekIE, @@ -397,6 +398,10 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) from .learnr import LearnrIE from .lecture2go import 
Lecture2GoIE from .lemonde import LemondeIE From 1979969f91abb514f0fe2bfdb3389fabf8e86488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 17:56:48 +0700 Subject: [PATCH 10/55] [extractor/generic] Add support for arkena embeds --- youtube_dl/extractor/generic.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d346cb1c..011940580 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -62,6 +62,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE @@ -1342,6 +1343,23 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Vimeo'], }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2146,6 +2164,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: From 3a8947650b6545e105ca43a551f7c35007b6f647 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 17:57:55 +0700 Subject: [PATCH 11/55] [arkenaplay] Remove extractor --- youtube_dl/extractor/arkenaplay.py | 151 ----------------------------- 1 file changed, 151 deletions(-) delete mode 100644 youtube_dl/extractor/arkenaplay.py diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py deleted file mode 100644 index 0061ea196..000000000 --- a/youtube_dl/extractor/arkenaplay.py +++ /dev/null @@ -1,151 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601 -) -import re - - -class ArkenaPlayIE(InfoExtractor): - IE_NAME = 'ArkenaPlay' - _VALID_URL = r'(?P<shortcut>arkena:(?P<version>[0-9]+):(?P<mediatype>[A-Za-z0-9]+):(?P<mediaId>[^:]+):(?P<widgetsettingId>[A-Za-z0-9]+):(?P<accountId>[A-Za-z0-9]+))|(?:(?P<host>https?://(?:www\.)?play\..*\..*)/embed/(?:avp/v[0-9]+/player/[A-Za-z0-9]+/)?(?P<id>.*)?)' - - _TESTS = [{ - 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', - 'md5': '6cea4f7d13810464ef8485a924fc3333', - 'info_dict': { - 'id': '327336', - 'url': 're:http://httpod.scdn.arkena.com/11970/327336.*', - 'ext': 'mp4', - 'title': '327336', - 'upload_date': '20160225', - 'timestamp': 1456391602 - } - }, { - # Shortcut for: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - 'url': 'arkena:2:media:b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe:1:129411', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', - 'info_dict': { - 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', - 'url': 'http://88e04ec095b07cd1aa3ea588be47e870.httpcache0.90034-httpcache0.dna.qbrick.com/90034-httpcache0/4bf759a1-00090034/bbb_sunflower_2160p_60fps_normal_720p.mp4', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'upload_date': '20150528', - 'timestamp': 1432816365 - } - }] - - def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - if mobj.group('shortcut'): - version = mobj.group('version') - mediatype = mobj.group('mediatype') - mediaid = mobj.group('mediaId') - widgetsettingid = mobj.group('widgetsettingId') - accountid = mobj.group('accountId') - display_id = '{0}:{1}:{2}:{3}'.format(mediatype, mediaid, widgetsettingid, accountid) - media_url = 'https://play.arkena.com/config/avp/v{0}/player/{1}/{2}/{3}/{4}/?callbackMethod=?'.format( - version, mediatype, mediaid, widgetsettingid, accountid) - else: - display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') - webpage = self._download_webpage(url, display_id) - - media_url_regex = '"(?P<mediainfo>(?P<host>.*)/(c|C)onfig/.*\?callbackMethod=\?)"' - media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') - hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') - if not hostname: - hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') - media_url = hostname + media_url - - # Extract the required info of the media files gathered in a dictionary - arkena_info = self._download_webpage(media_url, 'arkena_info_') - arkena_info_regex = r'\?\((?P<json>.*)\);' - media_dict = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), - display_id) - - # All videos are part of a playlist, a single video is also put in a playlist - playlist_items = media_dict.get('Playlist', []) - if len(playlist_items) == 0: - return self.url_result(url, 'Generic') - elif len(playlist_items) == 1: - arkena_media_info = playlist_items[0] - return self.__extract_from_playlistentry(arkena_media_info) - else: - entries_info = [] - for arkena_playlist_item in playlist_items: - entries_info.append(self.__extract_from_playlistentry(arkena_playlist_item)) - return { - 'id': display_id, - 'entries': entries_info - } - - def __extract_from_playlistentry(self, arkena_playlistentry_info): - media_info 
= arkena_playlistentry_info.get('MediaInfo', {}) - thumbnails = self.__get_thumbnails(media_info) - title = media_info.get('Title') - description = media_info.get('Description') - video_id = media_info.get('VideoId') - timestamp = parse_iso8601(media_info.get('PublishDate')) - formats = self.__get_video_formats(arkena_playlistentry_info, video_id) - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': description, - 'timestamp': timestamp - } - - def __get_thumbnails(self, arkena_mediainfo): - thumbnails = [] - thumbnails_info = arkena_mediainfo.get('Poster') - if not thumbnails_info: - return None - for thumbnail in thumbnails_info: - thumbnail_url = thumbnail.get('Url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('Size')) - }) - return thumbnails - - def __get_video_formats(self, media_files_info, video_id): - formats = [] - media_files = media_files_info.get('MediaFiles') - if not media_files: - return None - - for type_name, video_files_json in media_files.iteritems(): - for video_info in video_files_json: - video_url = video_info.get('Url') - if not video_url: - continue - type = video_info.get('Type') - if type_name in ['Mp4', 'WebM', 'Flash']: - bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) - ext = None - if type == 'video/mp4': - ext = 'mp4' - elif type == 'video/webm': - ext = 'webm' - elif type == 'video/x-flv': - ext = 'flv' - formats.append({ - 'url': video_url, - 'ext': ext, - 'tbr': bitrate - }) - elif type_name == 'M3u8' and type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - elif type_name == 'Flash' and type == 'application/hds+xml': - formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) - elif type_name == 'Dash' and type == 'application/dash+xml': - 
formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) - - self._sort_formats(formats) - return formats \ No newline at end of file From 6548030a17ba36de9f4a340cb71413d32b90f3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 18:00:19 +0700 Subject: [PATCH 12/55] Credit @rvanbekkum for arkena (#8682) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index f762e8a16..890c827a0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -178,3 +178,4 @@ Artur Krysiak Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel +Rob van Bekkum From f1991ce928d2a670b37ce4ac0e088459d9e28202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jul 2016 18:07:55 +0700 Subject: [PATCH 13/55] [arkena] Skip dash formats --- youtube_dl/extractor/arkena.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index d003a027b..d7c5eeb8a 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -81,12 +81,15 @@ class ArkenaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( f_url, video_id, f4m_id=kind, fatal=False)) elif kind == 'dash' or 'mpd' in exts: - formats.extend(self._extract_mpd_formats( - f_url, video_id, mpd_id=kind, fatal=False)) + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # f_url, video_id, mpd_id=kind, fatal=False)) + continue elif kind == 'silverlight': # TODO: process when ism is supported (see # https://github.com/rg3/youtube-dl/issues/8118) - pass + continue else: tbr = float_or_none(f.get('Bitrate'), 1000) formats.append({ From b4a131e1a5c345b17df37a9b0c76e612855e5402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 04:36:49 +0700 Subject: [PATCH 
14/55] [facebook] Relax _VALID_URL (Closes #10151) --- youtube_dl/extractor/facebook.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cdb093262..0fb781a73 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:\w+\.)?facebook\.com/ + (?:[\w-]+\.)?facebook\.com/ (?:[^#]*?\#!/)? (?: (?: @@ -127,6 +127,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + }, { + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }] @staticmethod From 111de00289d8c019764e79247568248f8a4b11f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= <trox1972@users.noreply.github.com> Date: Sat, 23 Jul 2016 17:02:50 +0200 Subject: [PATCH 15/55] [DailyMail] Improve title and description extraction --- youtube_dl/extractor/dailymail.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index b60a1d813..98c835bf1 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -5,19 +5,20 @@ from .common import InfoExtractor from ..utils import ( int_or_none, determine_protocol, + unescapeHTML, ) class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' _TEST = { - 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', - 'md5': '2f639d446394f53f3a33658b518b6615', + 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', + 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { - 'id': '1288527', + 'id': '1295863', 
'ext': 'mp4', - 'title': 'Turn any video into an impressionist masterpiece', - 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', + 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } } @@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) - title = video_data['title'] + title = unescapeHTML(video_data['title']) video_sources = self._download_json(video_data.get( 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) @@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video_data.get('descr'), + 'description': unescapeHTML(video_data.get('descr')), 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 'formats': formats, } From f09483485728871286f2670c8b8d62f56a89b1e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:27:16 +0700 Subject: [PATCH 16/55] [extractor/common] Add support for $ in SegmentTemplate in MPD manifests --- youtube_dl/extractor/common.py | 61 ++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 29544c1a8..b8a76e3cb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1481,6 +1481,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. 
MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1513,8 +1520,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1551,7 +1566,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1595,16 +1610,40 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': 
representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], From 94c04a3c793a8332ea68bb2eff2979da4ef66af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:28:11 +0700 Subject: [PATCH 17/55] [arkena] Enable dash formats --- youtube_dl/extractor/arkena.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index d7c5eeb8a..d45cae301 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -81,11 +81,8 @@ class ArkenaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( f_url, video_id, f4m_id=kind, fatal=False)) elif kind == 'dash' or 'mpd' in exts: - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not 
implemented yet - # formats.extend(self._extract_mpd_formats( - # f_url, video_id, mpd_id=kind, fatal=False)) - continue + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) elif kind == 'silverlight': # TODO: process when ism is supported (see # https://github.com/rg3/youtube-dl/issues/8118) From d9cb92c84058bce2c222b7a634608d7a16addcb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:29:09 +0700 Subject: [PATCH 18/55] [telegraaf] Enable dash formats --- youtube_dl/extractor/telegraaf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 9092e9b85..926d36e5a 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -49,9 +49,8 @@ class TelegraafIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( manifest_url, video_id, ext='mp4', m3u8_id='hls')) elif ext == 'mpd': - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - continue + formats.extend(self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False)) else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): From f75e6890dbc70f0686e967e89b5c422a3aee8951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:29:26 +0700 Subject: [PATCH 19/55] [telegraaf] Make hls non fatal --- youtube_dl/extractor/telegraaf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 926d36e5a..58078c531 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -47,7 +47,7 @@ class TelegraafIE(InfoExtractor): ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - 
manifest_url, video_id, ext='mp4', m3u8_id='hls')) + manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( manifest_url, video_id, mpd_id='dash', fatal=False)) From 5a65668e25ba424693a69cbaa81059e8c38a52ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:35:55 +0700 Subject: [PATCH 20/55] [dcn] Enable dash formats --- youtube_dl/extractor/dcn.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index efb8585e8..b8542820a 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -62,11 +62,9 @@ class DCNBaseIE(InfoExtractor): r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', r'<a[^>]+href="rtsp(://[^"]+)"' ], webpage, 'format url') - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - # formats.extend(self._extract_mpd_formats( - # format_url_base + '/manifest.mpd', - # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_mpd_formats( + format_url_base + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_m3u8_formats( format_url_base + '/playlist.m3u8', video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) From ae6fff4e64b34bdcbd5b4f90c89cfd9e55ddeffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 10:43:05 +0700 Subject: [PATCH 21/55] [onet] Enable dash formats --- youtube_dl/extractor/onet.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 402d3a9f7..fc22ad5eb 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -59,11 +59,8 @@ class OnetBaseIE(InfoExtractor): # TODO: Support Microsoft Smooth Streaming continue elif ext == 'mpd': - # TODO: Current DASH 
formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, mpd_id='dash', fatal=False)) - continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) else: formats.append({ 'url': video_url, From 9513c1eb1789818956faf6f6273f6563bae58aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 11:03:39 +0700 Subject: [PATCH 22/55] [tvp] Update dash format comment --- youtube_dl/extractor/tvp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 5070082da..e84876b54 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -89,8 +89,8 @@ class TVPIE(InfoExtractor): r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url, 'video base url', default=None) if video_url_base: - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet + # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. # formats.extend(self._extract_mpd_formats( # video_url_base + '.ism/video.mpd', # video_id, mpd_id='dash', fatal=False)) From 8fdc538b4687ee920a81bd661a04c608d75e0011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 11:39:50 +0700 Subject: [PATCH 23/55] release 2016.07.24 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f5b444b84..01a5ab5ec 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.22*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.22 +[debug] youtube-dl version 2016.07.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 36b8ec9ad..fa70fe49a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -46,6 +46,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** + - **Arkena** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -337,6 +338,8 @@ - **kuwo:song**: 酷我音乐 - **la7.it** - **Laola1Tv** + - **Lcp** + - **LcpPlay** - **Le**: 乐视网 - **Learnr** - **Lecture2Go** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 732a5dfdf..428bd9a97 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ 
-1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.22' +__version__ = '2016.07.24' From e8be2943a7bf6fc171ddf04d251a854f8e4ecf7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jul 2016 18:38:18 +0700 Subject: [PATCH 24/55] [smotri] Modernize, make more robust and fix tests --- youtube_dl/extractor/smotri.py | 99 +++++++++++++++++----------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 5c3fd0fec..114358786 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -13,20 +13,21 @@ from ..utils import ( sanitized_Request, unified_strdate, urlencode_postdata, + xpath_text, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '2a7b08249e6f5636557579c368040eb9', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', @@ -174,11 +175,11 @@ class SmotriIE(InfoExtractor): if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - video = self._download_json(request, video_id, 'Downloading video JSON') + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + 
data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) video_url = video.get('_vidURL') or video.get('_vidURL_mp4') @@ -196,11 +197,11 @@ class SmotriIE(InfoExtractor): raise ExtractorError(msg, expected=True) title = video['title'] - thumbnail = video['_imgURL'] - upload_date = unified_strdate(video['added']) - uploader = video['userNick'] - uploader_id = video['userLogin'] - duration = int_or_none(video['duration']) + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -209,7 +210,7 @@ class SmotriIE(InfoExtractor): # Warning if video is unavailable warning = self._html_search_regex( - r'<div class="videoUnModer">(.*?)</div>', webpage, + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( @@ -217,20 +218,22 @@ class SmotriIE(InfoExtractor): (video_id, warning)) # Adult content - if re.search('EroConfirmText">', webpage) is not None: + if 'EroConfirmText">' in webpage: self.report_age_confirmation() confirm_string = self._html_search_regex( - r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( - 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', - webpage, 'view count', fatal=False, 
flags=re.MULTILINE | re.DOTALL) + r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) return { 'id': video_id, @@ -249,37 +252,33 @@ class SmotriIE(InfoExtractor): class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', - 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - community_id = mobj.group('communityid') + community_id = self._match_id(url) - url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, 'Downloading community RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - community_title = self._html_search_regex( - '^Видео сообщества "([^"]+)"$', description_text, 'community title') - - return self.playlist_result(entries, community_id, community_title) + return self.playlist_result(entries, community_id) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { @@ 
-290,19 +289,19 @@ class SmotriUserIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('userid') + user_id = self._match_id(url) - url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = self._download_xml(url, user_id, 'Downloading user RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - user_nickname = self._html_search_regex( - '^Видео режиссера (.*)$', description_text, - 'user nickname') + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) return self.playlist_result(entries, user_id, user_nickname) @@ -310,11 +309,11 @@ class SmotriUserIE(InfoExtractor): class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' - _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('broadcastid') + broadcast_id = mobj.group('id') broadcast_url = 'http://' + mobj.group('url') broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') @@ -328,7 +327,8 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Erotic broadcasts allowed only for registered users') + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -343,8 +343,9 @@ class SmotriBroadcastIE(InfoExtractor): 
broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') - if re.search('>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError('Unable to log in: bad username or password', expected=True) + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) adult_content = True else: @@ -383,11 +384,11 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_thumbnail = broadcast_json.get('_imgURL') broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json['description'] - broadcaster_nick = broadcast_json['nick'] - broadcaster_login = broadcast_json['login'] + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: From 59eaf69e3300ad7f780d0f976052fcc3416284c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jul 2016 22:31:10 +0700 Subject: [PATCH 25/55] [camdemy] Fix camdemy --- youtube_dl/extractor/camdemy.py | 54 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 6ffbeabd3..8eece2c02 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -10,8 +9,9 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, str_to_int, + unified_strdate, ) @@ -26,14 +26,13 @@ class 
CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', 'creator': 'ss11spring', 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { @@ -41,33 +40,34 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"<div class='srcFrom'>Source: <a title='([^']+)'", page, - 'external source', default=None) + r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) @@ -78,27 +78,31 @@ class CamdemyIE(InfoExtractor): video_folder = 
compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, 'title': oembed_obj['title'], 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), + 'description': description, 'creator': oembed_obj['author_name'], 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'upload_date': upload_date, 'view_count': view_count, } From 0a147785e8d1bb763c2f0634bef1c580194e1d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Jul 2016 22:35:12 +0700 Subject: [PATCH 26/55] [camdemy] Extract duration properly --- youtube_dl/extractor/camdemy.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/camdemy.py 
b/youtube_dl/extractor/camdemy.py index 8eece2c02..268c34392 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + parse_duration, str_to_int, unified_strdate, ) @@ -27,6 +28,7 @@ class CamdemyIE(InfoExtractor): 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', 'view_count': int, } @@ -42,6 +44,7 @@ class CamdemyIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', + 'duration': 318, } }, { # External source (YouTube) @@ -74,6 +77,7 @@ class CamdemyIE(InfoExtractor): oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( @@ -97,11 +101,11 @@ class CamdemyIE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, 'description': description, - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), 'upload_date': upload_date, 'view_count': view_count, } From 712c7530ff4acf3b7ae82cc19b2babb1588a3eab Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:03:43 +0800 Subject: [PATCH 27/55] [mtv] Extract more metadata and more 1. Remove MTVIggyIE. All www.mtviggy.com URLs now redirects to www.mtv.com 2. Fix MTVDEIE 3. Return multiple URLs from _transform_rtmp_url. 
This is for tosh.cc.com --- youtube_dl/extractor/mtv.py | 58 +++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f3ec2ebbc..ba9c28a56 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -16,6 +16,7 @@ from ..utils import ( HEADRequest, sanitized_Request, strip_or_none, + timeconvert, unescapeHTML, url_basename, RegexNotFoundError, @@ -36,13 +37,13 @@ class MTVServicesInfoExtractor(InfoExtractor): return uri.split(':')[-1] # This was originally implemented for ComedyCentral, but it also works here - @staticmethod - def _transform_rtmp_url(rtmp_video_url): + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) if not m: - return rtmp_video_url + return {'rtmp': rtmp_video_url} base = 'http://viacommtvstrmfs.fplive.net/' - return base + m.group('finalid') + return {'http': base + m.group('finalid')} def _get_feed_url(self, uri): return self._FEED_URL @@ -86,14 +87,14 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue - new_url = self._transform_rtmp_url(rtmp_video_url) - formats.append({ + new_urls = self._transform_rtmp_url(rtmp_video_url) + formats.extend([{ 'ext': 'flv' if new_url.startswith('rtmp') else ext, 'url': new_url, - 'format_id': rendition.get('bitrate'), + 'format_id': '-'.join([kind, rendition.get('bitrate')]), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), - }) + } for kind, new_url in new_urls.items()]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) @@ -136,6 +137,8 @@ class MTVServicesInfoExtractor(InfoExtractor): description = strip_or_none(xpath_text(itemdoc, 'description')) + timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) + title_el = 
None if title_el is None: title_el = find_xpath_attr( @@ -168,6 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor): 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), + 'timestamp': timestamp, } def _get_feed_query(self, uri): @@ -186,8 +190,13 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) + + title = xpath_text(idoc, './channel/title') + description = xpath_text(idoc, './channel/description') + return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')]) + [self._get_video_info(item) for item in idoc.findall('.//item')], + playlist_title=title, playlist_description=description) def _extract_mgid(self, webpage): try: @@ -233,6 +242,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', + 'timestamp': 1400126400, + 'upload_date': '20140515', }, } @@ -275,6 +286,8 @@ class MTVIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'timestamp': 1352610000, + 'upload_date': '20121111', }, }, ] @@ -301,20 +314,6 @@ class MTVIE(MTVServicesInfoExtractor): return self._get_videos_info(uri) -class MTVIggyIE(MTVServicesInfoExtractor): - IE_NAME = 'mtviggy.com' - _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' - _TEST = { - 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', - 'info_dict': { - 'id': '984696', - 'ext': 'mp4', - 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', 
- } - } - _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' - - class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$' @@ -322,7 +321,7 @@ class MTVDEIE(MTVServicesInfoExtractor): 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', 'info_dict': { 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'MusicVideo_cro-traum', 'description': 'Cro - Traum', }, @@ -335,7 +334,7 @@ class MTVDEIE(MTVServicesInfoExtractor): 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', 'info_dict': { 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', }, 'params': { @@ -343,7 +342,6 @@ class MTVDEIE(MTVServicesInfoExtractor): 'skip_download': True, }, }, { - # single video in pagePlaylist with different id 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', @@ -355,6 +353,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] def _real_extract(self, url): @@ -367,11 +366,14 @@ class MTVDEIE(MTVServicesInfoExtractor): r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id) + def _mrss_url(item): + return item['mrss'] + item.get('mrssvars', '') + # news pages contain single video in playlist with different id if len(playlist) == 1: - return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) for item in playlist: item_id = item.get('id') if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(item['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(item), 
video_id) From cc99d4f826a942b18133fe4221c9de2f9197e860 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:06:50 +0800 Subject: [PATCH 28/55] [comedycentral] Remove IEs for *.cc.com except tosh.cc.com All other subdomains now redirects to cc.com/* URLs --- youtube_dl/extractor/comedycentral.py | 280 ++++---------------------- 1 file changed, 35 insertions(+), 245 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 63f68f765..c76909e48 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,17 +1,6 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - float_or_none, - unified_strdate, -) class ComedyCentralIE(MTVServicesInfoExtractor): @@ -26,8 +15,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', - 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', @@ -35,244 +26,43 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] -class ComedyCentralShowsIE(MTVServicesInfoExtractor): - IE_DESC = 'The Daily Show / The Colbert Report' - # urls can be abbreviations like :thedailyshow - # urls for episodes like: - # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day - # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news - # 
or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) - |https?://(:www\.)? - (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ - ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| - (?P<clip> - (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) - |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) - |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) - )| - (?P<interview> - extended-interviews/(?P<interID>[0-9a-z]+)/ - (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?) - (?:/[^/?#]?|[?#]|$)))) - ''' +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + _TESTS = [{ - 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', - 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', - 'info_dict': { - 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', - 'ext': 'mp4', - 'upload_date': '20121213', - 'description': 'Kristen Stewart learns to let loose in "On the Road."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow kristen-stewart part 1', - } - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', - 'info_dict': { - 'id': 'sarah-chayes-extended-interview', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'title': 'thedailyshow Sarah Chayes Extended Interview', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt 
institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 1', - }, - }, - { - 'info_dict': { - 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 2', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', - 'only_matching': True, - 
}, { 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': 're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + }, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', 'only_matching': True, }] - _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] - - _video_extensions = { - '3500': 'mp4', - '2200': 'mp4', - '1700': 'mp4', - '1200': 'mp4', - '750': 'mp4', - '400': 'mp4', - } - _video_dimensions = { - '3500': (1280, 720), - '2200': (960, 540), - '1700': (768, 432), - '1200': (640, 360), - '750': (512, 288), - '400': (384, 216), - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - if mobj.group('shortname'): - return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') - - if mobj.group('clip'): - if mobj.group('videotitle'): - epTitle = mobj.group('videotitle') - elif mobj.group('showname') == 'thedailyshow': - epTitle = mobj.group('tdstitle') - else: - epTitle = mobj.group('cntitle') - dlNewest = False - elif mobj.group('interview'): - epTitle = mobj.group('interview_title') - dlNewest = False - else: - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') - else: - epTitle = mobj.group('episode') - show_name = mobj.group('showname') - - webpage, htmlHandle = self._download_webpage_handle(url, epTitle) - if dlNewest: - url = 
htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid redirected URL: ' + url) - if mobj.group('episode') == '': - raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] - - mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) - if len(mMovieParams) == 0: - # The Colbert Report embeds the information in a without - # a URL prefix; so extract the alternate reference - # and then add the URL prefix manually. - - altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) - if len(altMovieParams) == 0: - raise ExtractorError('unable to find Flash URL in webpage ' + url) - else: - mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])] - - uri = mMovieParams[0][1] - # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) - idoc = self._download_xml( - index_url, epTitle, - 'Downloading show index', 'Unable to download episode index') - - title = idoc.find('./channel/title').text - description = idoc.find('./channel/description').text - - entries = [] - item_els = idoc.findall('.//item') - for part_num, itemEl in enumerate(item_els): - upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) - thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') - - content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') - duration = float_or_none(content.attrib.get('duration')) - mediagen_url = content.attrib['url'] - guid = itemEl.find('./guid').text.rpartition(':')[-1] - - cdoc = self._download_xml( - mediagen_url, epTitle, - 'Downloading configuration for segment %d / 
%d' % (part_num + 1, len(item_els))) - - turls = [] - for rendition in cdoc.findall('.//rendition'): - finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) - turls.append(finfo) - - formats = [] - for format, rtmp_video_url in turls: - w, h = self._video_dimensions.get(format, (None, None)) - formats.append({ - 'format_id': 'vhttp-%s' % format, - 'url': self._transform_rtmp_url(rtmp_video_url), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - formats.append({ - 'format_id': 'rtmp-%s' % format, - 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - self._sort_formats(formats) - - subtitles = self._extract_subtitles(cdoc, guid) - - virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) - entries.append({ - 'id': guid, - 'title': virtual_id, - 'formats': formats, - 'uploader': show_name, - 'upload_date': upload_date, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'subtitles': subtitles, - }) - - return { - '_type': 'playlist', - 'id': epTitle, - 'entries': entries, - 'title': show_name + ' ' + title, - 'description': description, - } + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) + new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') + return new_urls class ComedyCentralTVIE(MTVServicesInfoExtractor): From 14a28e705b3e27606c2fdbc242ae5a33c2e9f28e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:08:09 +0800 Subject: [PATCH 29/55] [test/test_all_urls] Remove *.cc.com tests --- test/test_all_urls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 1f6079c29..cd1cd4b24 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,8 +101,6 @@ class 
TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentralShows']) - self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) From 5c32a77cad549f9d2f3f02ed4204e4591dd1889b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:08:55 +0800 Subject: [PATCH 30/55] [nextmovie] Remove extractor This domain name now redirects to mtv.com --- youtube_dl/extractor/nextmovie.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 youtube_dl/extractor/nextmovie.py diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py deleted file mode 100644 index 9ccd7d774..000000000 --- a/youtube_dl/extractor/nextmovie.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode - - -class NextMovieIE(MTVServicesInfoExtractor): - IE_NAME = 'nextmovie.com' - _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' - _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' - _TESTS = [{ - 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', - 'md5': '09a9199f2f11f10107d04fcb153218aa', - 'info_dict': { - 'id': '961726', - 'ext': 'mp4', - 'title': 'The Muppets\' Gravity', - }, - }] - - def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ - 'feed': '1505', - 'mgid': uri, - }) - - def _real_extract(self, url): - mgid = self._match_id(url) - return self._get_videos_info(mgid) From a3aa814b774a413d9e7f4fbfadf06fe6dcc59b25 Mon Sep 17 00:00:00 2001 From: Yen Chi 
Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:10:41 +0800 Subject: [PATCH 31/55] Update _TESTS for MTV sites --- youtube_dl/extractor/nick.py | 1 + youtube_dl/extractor/southpark.py | 34 ++++++++++++++++--- youtube_dl/extractor/spike.py | 4 ++- youtube_dl/extractor/tvland.py | 55 +++++++------------------------ 4 files changed, 44 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 4935002d0..9c54846e1 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -7,6 +7,7 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): + # None of videos on the website are still alive? IE_NAME = 'nick.com' _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 87b650468..a147f7db1 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -17,6 +17,8 @@ class SouthParkIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', }, }] @@ -28,6 +30,10 @@ class SouthParkEsIE(SouthParkIE): _TESTS = [{ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, 'playlist_count': 4, }] @@ -42,17 +48,27 @@ class SouthParkDeIE(SouthParkIE): 'info_dict': { 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', + 'title': 'South Park|The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits 
of "Shitter" to Stan, Kyle and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', }, }, { # non-ASCII characters in initial URL 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }, { # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }] @@ -63,7 +79,11 @@ class SouthParkNlIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, }] @@ -74,5 +94,9 @@ class SouthParkDkIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, }] diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 63ea7718b..218785ee4 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -11,8 +11,10 @@ class SpikeIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', - 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', + 'title': 'Auction Hunters|December 27, 2013|4|414|Can 
Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + 'timestamp': 1388120400, + 'upload_date': '20131227', }, }, { 'url': 'http://www.spike.com/video-clips/lhtu8m/', diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index b73279dec..cb76a2a58 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -9,56 +9,23 @@ class TVLandIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', - 'playlist': [ - { - 'md5': '227e9723b9669c05bf51098b10287aa7', - 'info_dict': { - 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', - } - }, - { - 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', - 'info_dict': { - 'id': 'f4279548-6e13-40dd-92e8-860d27289197', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', - } - }, - { - 'md5': 'fde4c3bccd7cc7e3576b338734153cec', - 'info_dict': { - 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', - } - }, - { - 'md5': '247f6780cda6891f2e49b8ae2b10e017', - 'info_dict': { - 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', - } - }, - { - 'md5': 'fd269f33256e47bad5eb6c40de089ff6', - 'info_dict': { - 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', - } - } - ], + 'info_dict': { + 'description': 
'md5:80973e81b916a324e05c14a3fb506d29', + 'title': 'The Invasion', + }, + 'playlist': [], }, { 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', 'md5': 'e2c6389401cf485df26c79c247b08713', 'info_dict': { 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', 'ext': 'mp4', - 'title': 'Younger|Younger: Hilary Duff - Little Lies', - 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + 'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269', + 'upload_date': '20151228', + 'timestamp': 1451289600, }, }] From 05d1e7aaa9cee8d9b7b4d96b480d5e2f32f15f47 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:11:36 +0800 Subject: [PATCH 32/55] [generic] Fix an MTV test and another test that breaks nosetests --- youtube_dl/extractor/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 011940580..1eac679e5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -474,7 +474,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -641,6 +641,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> From 4d3e543c73949765deb4b144b1042b1baedd7692 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 11:17:28 +0800 Subject: [PATCH 33/55] Update extractors.py --- 
youtube_dl/extractor/extractors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3ca0ef83a..53fab1a31 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -159,8 +159,8 @@ from .coub import CoubIE from .collegerama import CollegeRamaIE from .comedycentral import ( ComedyCentralIE, - ComedyCentralShowsIE, ComedyCentralTVIE, + ToshIE, ) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE @@ -480,7 +480,6 @@ from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, - MTVIggyIE, MTVDEIE, ) from .muenchentv import MuenchenTVIE @@ -530,7 +529,6 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) -from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( From 10a1bb3a78453a48f5006179ccb7c6998579dde7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 13:12:24 +0800 Subject: [PATCH 34/55] [mtv] Fix for videos with missing bitrates --- youtube_dl/extractor/mtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ba9c28a56..9e71c08c2 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -91,7 +91,7 @@ class MTVServicesInfoExtractor(InfoExtractor): formats.extend([{ 'ext': 'flv' if new_url.startswith('rtmp') else ext, 'url': new_url, - 'format_id': '-'.join([kind, rendition.get('bitrate')]), + 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), } for kind, new_url in new_urls.items()]) From c74299a72c7d003f95d700b80a69469d567fce05 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 13:13:14 +0800 Subject: [PATCH 35/55] [cmt] Detect unavailable videos and 
update _TESTS --- youtube_dl/extractor/cmt.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f1311b14f..f24568dcc 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals + from .mtv import MTVIE +from ..utils import ExtractorError class CMTIE(MTVIE): @@ -16,7 +18,27 @@ class CMTIE(MTVIE): 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % cls.IE_NAME, expected=True) + + return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) From 326fa4e6e52cb0d02c6026d8a2d32e34b8e99107 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 13:16:04 +0800 Subject: [PATCH 36/55] [generic] Skip an invalid test --- youtube_dl/extractor/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1eac679e5..98dd6a7e3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -859,6 +859,7 @@ class GenericIE(InfoExtractor): 
'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { From 35f6e0ff365cb2958164caad66d4f15e400f31c0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 26 Jul 2016 13:19:47 +0800 Subject: [PATCH 37/55] [mtv.de] Skip 2 geo-restricted tests --- youtube_dl/extractor/mtv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 9e71c08c2..2f455680e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -329,6 +329,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', @@ -341,6 +342,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { From 7f8b92e3cfffc5983423ad3b078c2132b9ff39cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 21:44:53 +0700 Subject: [PATCH 38/55] [bigflix] Update tests --- youtube_dl/extractor/bigflix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index 33762ad93..b19f35b5d 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -12,7 +12,7 @@ class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', + 'md5': 'dc1b4aebb46e3a7077ecc0d9f43f61e3', 
'info_dict': { 'id': '16537', 'ext': 'mp4', @@ -26,7 +26,7 @@ class BigflixIE(InfoExtractor): 'id': '16070', 'ext': 'mp4', 'title': 'Madarasapatinam', - 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', 'formats': 'mincount:2', }, 'params': { From 88bd486b9a287db7de2a859863ed3356b418cd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 22:58:50 +0700 Subject: [PATCH 39/55] [cbc] Improve extraction for videos embedded with clipId --- youtube_dl/extractor/cbc.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 06772d492..885d2c721 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( js_to_json, smuggle_url, + try_get, ) @@ -27,7 +29,20 @@ class CBCIE(InfoExtractor): }, 'skip': 'Geo-restricted to Canada', }, { - # with clipId + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { @@ -83,9 +98,15 @@ class CBCIE(InfoExtractor): media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] - 
media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}'% clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) else: entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] From dc35ba0eba000d9557b40387838f7dfff103286f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:06:21 +0700 Subject: [PATCH 40/55] [mgtv] Fix typo --- youtube_dl/extractor/mgtv.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index d970e94ec..27bdff8b2 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor): _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _TEST = { + _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { @@ -20,7 +20,11 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - } + }, { + # no tbr extracted from stream_url + 'url': 'http://www.mgtv.com/v/1/1/f/3324755.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -41,7 +45,8 @@ class MGTVIE(InfoExtractor): def extract_format(stream_url, 
format_id, idx, query={}): format_info = self._download_json( stream_url, video_id, - note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + note='Download video info for format %s' % (format_id or '#%d' % idx), + query=query) return { 'format_id': format_id, 'url': format_info['info'], From 9a700deea47b2514ef07d4ab7a0c21c7942b8b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:07:16 +0700 Subject: [PATCH 41/55] [instagram] Remove duplicate field in test --- youtube_dl/extractor/instagram.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index fc0197ae1..8f7f232be 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -36,7 +36,6 @@ class InstagramIE(InfoExtractor): 'info_dict': { 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', - 'uploader_id': 'britneyspears', 'title': 'Video by britneyspears', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1453760977, From ced70c86400d09b0a03ecc0500e2874efc50b354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:08:08 +0700 Subject: [PATCH 42/55] [cbc] PEP 8 --- youtube_dl/extractor/cbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 885d2c721..a87e97140 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -99,7 +99,7 @@ class CBCIE(InfoExtractor): if not media_id: clip_id = player_info['clipId'] feed = self._download_json( - 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}'% clip_id, + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, clip_id, fatal=False) if feed: media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) From 3e050d51d4752e89269c5ae82cf4f0f49ad424b8 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:14:04 +0700 Subject: [PATCH 43/55] [orf:oe1] Relax _VALID_URL --- youtube_dl/extractor/orf.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index ccb23e069..6ae30679a 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,13 +137,16 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. - _TEST = { + _TESTS = [{ 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', 'only_matching': True, - } + }, { + 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', + 'only_matching': True, + }] def _real_extract(self, url): show_id = self._match_id(url) From 40090e8d519f88c85ce65d8e882093d8ab26e368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:54:06 +0700 Subject: [PATCH 44/55] [extractor/common] Improve is_suitable In order to fix breakage introduced by a3aa814b774a413d9e7f4fbfadf06fe6dcc59b25 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b8a76e3cb..53c28f016 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1786,7 +1786,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = age_restricted( tc.get('info_dict', {}).get('age_limit'), 
age_limit) From dcbb07c35a113ace7b6b90c40f661f9c31174a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Jul 2016 23:56:53 +0700 Subject: [PATCH 45/55] release 2016.07.26.2 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 4 +--- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 01a5ab5ec..f102d2611 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.26.2*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.26.2** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.24 +[debug] youtube-dl version 2016.07.26.2 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index fa70fe49a..1f89b1c14 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,7 +142,6 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - - **ComedyCentralShows**: The Daily Show / The Colbert Report - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** @@ -401,7 +400,6 @@ - **MSN** - **MTV** - **mtv.de** - - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -441,7 +439,6 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 - - **nextmovie.com** - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -699,6 +696,7 @@ - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics user profile - **ToypicsUser**: Toypics user profile diff --git a/youtube_dl/version.py b/youtube_dl/version.py 
index 428bd9a97..9a1642407 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.24' +__version__ = '2016.07.26.2' From 7935926baadaad1bcf5c45b1da0f3d2d8a173ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 27 Jul 2016 00:14:40 +0700 Subject: [PATCH 46/55] [devscripts/show-downloads-statistics] Add support for paging --- devscripts/show-downloads-statistics.py | 36 ++++++++++++++----------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py index b591d3fc9..e25d28411 100644 --- a/devscripts/show-downloads-statistics.py +++ b/devscripts/show-downloads-statistics.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import unicode_literals +import itertools import json import os import re @@ -21,21 +22,26 @@ def format_size(bytes): total_bytes = 0 -releases = json.loads(compat_urllib_request.urlopen( - 'https://api.github.com/repos/rg3/youtube-dl/releases').read().decode('utf-8')) +for page in itertools.count(1): + releases = json.loads(compat_urllib_request.urlopen( + 'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page + ).read().decode('utf-8')) -for release in releases: - compat_print(release['name']) - for asset in release['assets']: - asset_name = asset['name'] - total_bytes += asset['download_count'] * asset['size'] - if all(not re.match(p, asset_name) for p in ( - r'^youtube-dl$', - r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', - r'^youtube-dl\.exe$')): - continue - compat_print( - ' %s size: %s downloads: %d' - % (asset_name, format_size(asset['size']), asset['download_count'])) + if not releases: + break + + for release in releases: + compat_print(release['name']) + for asset in release['assets']: + asset_name = asset['name'] + total_bytes += asset['download_count'] * asset['size'] + if all(not 
re.match(p, asset_name) for p in ( + r'^youtube-dl$', + r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', + r'^youtube-dl\.exe$')): + continue + compat_print( + ' %s size: %s downloads: %d' + % (asset_name, format_size(asset['size']), asset['download_count'])) compat_print('total downloads traffic: %s' % format_size(total_bytes)) From 289a16b4f3c67faa230d1b05b48f0c224b3472d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 27 Jul 2016 23:28:01 +0700 Subject: [PATCH 47/55] [shared] Respect redirect URL (Closes #10170) --- youtube_dl/extractor/shared.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index e7e5f653e..6757e6ccf 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -37,15 +37,17 @@ class SharedIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage, urlh = self._download_webpage_handle(url, video_id) if '>File does not exist<' in webpage: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) download_form = self._hidden_inputs(webpage) + request = sanitized_Request( - url, urlencode_postdata(download_form)) + urlh.geturl(), urlencode_postdata(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( From 05c8268c8192dcc4f61c869aba659c8e51d040bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 27 Jul 2016 23:39:02 +0700 Subject: [PATCH 48/55] [shared] Modernize and make more robust --- youtube_dl/extractor/shared.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 6757e6ccf..d592dfeb8 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ 
-6,7 +6,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, ) @@ -46,21 +45,24 @@ class SharedIE(InfoExtractor): download_form = self._hidden_inputs(webpage) - request = sanitized_Request( - urlh.geturl(), urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - video_page = self._download_webpage( - request, video_id, 'Downloading video page') + urlh.geturl(), video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': urlh.geturl(), + }) video_url = self._html_search_regex( - r'data-url="([^"]+)"', video_page, 'video URL') + r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') title = base64.b64decode(self._html_search_meta( 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( - r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None) + r'data-poster=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'thumbnail', default=None, group='url') return { 'id': video_id, From 39eef54cf01a9017da0ea83618978293f19841bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 21:38:23 +0700 Subject: [PATCH 49/55] [ard:mediathek] Skip unavailable test --- youtube_dl/extractor/ard.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index c15cf1575..07e67dd33 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -73,6 +73,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', 'duration': 3287, }, + 'skip': 'Video is no longer available', }] def _extract_media_info(self, media_info_url, webpage, 
video_id): From 94aae015485801ff1edc3689a11f8866b077ce47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 22:15:15 +0700 Subject: [PATCH 50/55] [extractor/generic] Extract all soundcloud embeds (Closes #10179) --- youtube_dl/extractor/generic.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 98dd6a7e3..6c4af6424 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -71,6 +71,7 @@ from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE +from .soundcloud import SoundcloudIE class GenericIE(InfoExtractor): @@ -1999,12 +2000,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) From fbdf8d15d110c75c9029d86943a9222226b9b3f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 22:16:05 +0700 Subject: [PATCH 51/55] [soundcloud] Add _extract_urls (#10179) --- youtube_dl/extractor/soundcloud.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 194dabc71..aeae931a2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -119,6 +119,12 @@ class SoundcloudIE(InfoExtractor): _CLIENT_ID = 
'02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen('%s: Resolving id' % video_id) From db19df6ca06f0470fa1c08b0a5255625f658ac3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 22:20:08 +0700 Subject: [PATCH 52/55] [extractor/generic] Add test for #10179 --- youtube_dl/extractor/generic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6c4af6424..5364f0b19 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -785,6 +785,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', From 74ba450a816927cdcf6ab5a60182359a8a1d8469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 22:30:09 +0700 Subject: [PATCH 53/55] [twitch:clips] Fix extraction (Closes #9767) --- youtube_dl/extractor/twitch.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 67b1277cc..5757eb119 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -461,7 +461,7 @@ class 
TwitchClipsIE(InfoExtractor): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { @@ -473,7 +473,11 @@ class TwitchClipsIE(InfoExtractor): 'uploader': 'stereotype_', 'uploader_id': 'stereotype_', }, - } + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -485,15 +489,25 @@ class TwitchClipsIE(InfoExtractor): r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), video_id, transform_source=js_to_json) - video_url = clip['clip_video_url'] - title = clip['channel_title'] + title = clip.get('channel_title') or self._og_search_title(webpage) + + formats = [{ + 'url': option['source'], + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + } for option in clip.get('quality_options', []) if option.get('source')] + + if not formats: + formats = [{ + 'url': clip['clip_video_url'], + }] return { 'id': video_id, - 'url': video_url, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), 'uploader': clip.get('curator_login'), 'uploader_id': clip.get('curator_display_name'), + 'formats': formats, } From d9d56deadf1795a88ec1cfa7b18c13ca559b3d50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Jul 2016 02:42:57 +0700 Subject: [PATCH 54/55] release 2016.07.28 --- .github/ISSUE_TEMPLATE.md | 6 +++--- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f102d2611..27257ee0a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using 
the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.26.2*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.26.2** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.26.2 +[debug] youtube-dl version 2016.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9a1642407..2cfa406d9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.26.2' +__version__ = '2016.07.28' From 0cacae280725074fd4eef82c079b0829a15206f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Jul 2016 09:01:53 +0700 Subject: [PATCH 55/55] [twitch:clips] Sort formats --- 
youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 5757eb119..890f55180 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -502,6 +502,8 @@ class TwitchClipsIE(InfoExtractor): 'url': clip['clip_video_url'], }] + self._sort_formats(formats) + return { 'id': video_id, 'title': title,