diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 8b68f371b..27257ee0a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.17 +[debug] youtube-dl version 2016.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index f762e8a16..890c827a0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -178,3 +178,4 @@ Artur Krysiak Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel +Rob van Bekkum diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py index b591d3fc9..e25d28411 100644 --- a/devscripts/show-downloads-statistics.py +++ b/devscripts/show-downloads-statistics.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import unicode_literals +import itertools import json import os import re @@ -21,21 +22,26 @@ def format_size(bytes): total_bytes = 0 -releases = json.loads(compat_urllib_request.urlopen( - 'https://api.github.com/repos/rg3/youtube-dl/releases').read().decode('utf-8')) +for page in itertools.count(1): + releases = json.loads(compat_urllib_request.urlopen( + 'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page + ).read().decode('utf-8')) -for release in releases: - compat_print(release['name']) - for asset in release['assets']: - asset_name = asset['name'] - total_bytes += asset['download_count'] * asset['size'] - if all(not re.match(p, asset_name) for p in ( - r'^youtube-dl$', - r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', - r'^youtube-dl\.exe$')): - continue - compat_print( - ' %s size: %s downloads: %d' - % (asset_name, format_size(asset['size']), asset['download_count'])) + if not releases: + break + + for release in releases: + compat_print(release['name']) + for asset in release['assets']: + asset_name = asset['name'] + total_bytes += asset['download_count'] * asset['size'] + if all(not re.match(p, asset_name) for p in ( + r'^youtube-dl$', + r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', + r'^youtube-dl\.exe$')): + continue + compat_print( + ' %s size: %s downloads: %d' + % (asset_name, format_size(asset['size']), asset['download_count'])) compat_print('total downloads traffic: %s' % format_size(total_bytes)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index eaa165347..1f89b1c14 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -46,6 +46,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** + - **Arkena** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -141,7 +142,7 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - - **ComedyCentralShows**: The Daily Show / The Colbert Report + - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** - **Cracked** @@ -336,6 +337,8 @@ - **kuwo:song**: 酷我音乐 - **la7.it** - **Laola1Tv** + - **Lcp** + - **LcpPlay** - **Le**: 乐视网 - **Learnr** - **Lecture2Go** @@ -397,7 +400,6 @@ - **MSN** - **MTV** - **mtv.de** - - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -437,7 +439,6 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 - - **nextmovie.com** - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -477,6 +478,7 @@ - **NYTimes** - **NYTimesArticle** - **ocw.mit.edu** + - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** @@ -694,6 +696,7 @@ - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics user profile - **ToypicsUser**: Toypics user profile diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 1f6079c29..cd1cd4b24 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,8 +101,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentralShows']) - self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index c15cf1575..07e67dd33 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -73,6 +73,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', 'duration': 3287, }, + 'skip': 'Video is no longer available', }] def _extract_media_info(self, media_info_url, webpage, video_id): diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 000000000..d45cae301 --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P[^/]+)/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: + f_url = f.get('Url') + if not f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + continue + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index 33762ad93..b19f35b5d 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -12,7 +12,7 @@ class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', + 'md5': 'dc1b4aebb46e3a7077ecc0d9f43f61e3', 'info_dict': { 'id': '16537', 'ext': 'mp4', @@ -26,7 +26,7 @@ class BigflixIE(InfoExtractor): 'id': '16070', 'ext': 'mp4', 'title': 'Madarasapatinam', - 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', 'formats': 'mincount:2', }, 'params': { diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 6ffbeabd3..268c34392 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -10,8 +9,10 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, + parse_duration, str_to_int, + unified_strdate, ) @@ -26,14 +27,14 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { @@ -41,64 +42,71 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, + 'duration': 318, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"
Source: ]*>Sources?(?:\s+from)?\s*:\s*]+(?:href|title)=(['\"])(?P(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"
Posted\s*:
\s*
([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"
Views\s*:
\s*
([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, 'view_count': view_count, } diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 06772d492..a87e97140 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( js_to_json, smuggle_url, + try_get, ) @@ -27,7 +29,20 @@ class CBCIE(InfoExtractor): }, 'skip': 'Geo-restricted to Canada', }, { - # with clipId + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { @@ -83,9 +98,15 @@ class CBCIE(InfoExtractor): media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) else: entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)] diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f1311b14f..f24568dcc 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals + from .mtv import MTVIE +from ..utils import ExtractorError class CMTIE(MTVIE): @@ -16,7 +18,27 @@ class CMTIE(MTVIE): 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % cls.IE_NAME, expected=True) + + return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 63f68f765..c76909e48 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,17 +1,6 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - float_or_none, - unified_strdate, -) class ComedyCentralIE(MTVServicesInfoExtractor): @@ -26,8 +15,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', - 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', @@ -35,244 +26,43 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] -class ComedyCentralShowsIE(MTVServicesInfoExtractor): - IE_DESC = 'The Daily Show / The Colbert Report' - # urls can be abbreviations like :thedailyshow - # urls for episodes like: - # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day - # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news - # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r'''(?x)^(:(?Ptds|thedailyshow) - |https?://(:www\.)? - (?Pthedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ - ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P.*)| - (?P - (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P[^/?#]+)) - |(the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*)) - )| - (?P - extended-interviews/(?P[0-9a-z]+)/ - (?:playlist_tds_extended_)?(?P[^/?#]*?) - (?:/[^/?#]?|[?#]|$)))) - ''' +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + _TESTS = [{ - 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', - 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', - 'info_dict': { - 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', - 'ext': 'mp4', - 'upload_date': '20121213', - 'description': 'Kristen Stewart learns to let loose in "On the Road."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow kristen-stewart part 1', - } - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', - 'info_dict': { - 'id': 'sarah-chayes-extended-interview', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'title': 'thedailyshow Sarah Chayes Extended Interview', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 1', - }, - }, - { - 'info_dict': { - 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 2', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', - 'only_matching': True, - }, { 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': 're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + }, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', 'only_matching': True, }] - _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] - - _video_extensions = { - '3500': 'mp4', - '2200': 'mp4', - '1700': 'mp4', - '1200': 'mp4', - '750': 'mp4', - '400': 'mp4', - } - _video_dimensions = { - '3500': (1280, 720), - '2200': (960, 540), - '1700': (768, 432), - '1200': (640, 360), - '750': (512, 288), - '400': (384, 216), - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - if mobj.group('shortname'): - return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') - - if mobj.group('clip'): - if mobj.group('videotitle'): - epTitle = mobj.group('videotitle') - elif mobj.group('showname') == 'thedailyshow': - epTitle = mobj.group('tdstitle') - else: - epTitle = mobj.group('cntitle') - dlNewest = False - elif mobj.group('interview'): - epTitle = mobj.group('interview_title') - dlNewest = False - else: - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') - else: - epTitle = mobj.group('episode') - show_name = mobj.group('showname') - - webpage, htmlHandle = self._download_webpage_handle(url, epTitle) - if dlNewest: - url = htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid redirected URL: ' + url) - if mobj.group('episode') == '': - raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] - - mMovieParams = re.findall('(?:[0-9]+)' _TEST = { - 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', - 'md5': '2f639d446394f53f3a33658b518b6615', + 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', + 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { - 'id': '1288527', + 'id': '1295863', 'ext': 'mp4', - 'title': 'Turn any video into an impressionist masterpiece', - 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', + 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } } @@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) - title = video_data['title'] + title = unescapeHTML(video_data['title']) video_sources = self._download_json(video_data.get( 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) @@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video_data.get('descr'), + 'description': unescapeHTML(video_data.get('descr')), 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 'formats': formats, } diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index efb8585e8..b8542820a 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -62,11 +62,9 @@ class DCNBaseIE(InfoExtractor): r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', r']+href="rtsp(://[^"]+)"' ], webpage, 'format url') - # TODO: Current DASH formats are broken - $Time$ pattern in - # not implemented yet - # formats.extend(self._extract_mpd_formats( - # format_url_base + '/manifest.mpd', - # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_mpd_formats( + format_url_base + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_m3u8_formats( format_url_base + '/playlist.m3u8', video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index ac5d0fe24..f3734e9f8 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,19 +4,23 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + encode_base_n, + ExtractorError, + int_or_none, parse_duration, str_to_int, ) class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P\w+)/(?P[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P\w+)(?:/(?P[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { - 'id': '95008', + 'id': 'qlDUmNsj6VS', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', @@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor): # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'(.*?) - EPORNER', webpage, 'title') + webpage, urlh = self._download_webpage_handle(url, display_id) - redirect_url = 'http://www.eporner.com/config5/%s' % video_id - player_code = self._download_webpage( - redirect_url, display_id, note='Downloading player config') + video_id = self._match_id(compat_str(urlh.geturl())) - sources = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources') + hash = self._search_regex( + r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) - EPORNER', webpage, 'title') + + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] formats = [] - for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources): - fmt = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = format_dict.get('src') + if not isinstance(src, compat_str) or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) self._sort_formats(formats) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6de50296c..53fab1a31 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arkena import ArkenaIE from .ard import ( ARDIE, ARDMediathekIE, @@ -158,8 +159,8 @@ from .coub import CoubIE from .collegerama import CollegeRamaIE from .comedycentral import ( ComedyCentralIE, - ComedyCentralShowsIE, ComedyCentralTVIE, + ToshIE, ) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE @@ -397,6 +398,10 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE @@ -475,7 +480,6 @@ from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, - MTVIggyIE, MTVDEIE, ) from .muenchentv import MuenchenTVIE @@ -525,7 +529,6 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) -from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cdb093262..0fb781a73 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:\w+\.)?facebook\.com/ + (?:[\w-]+\.)?facebook\.com/ (?:[^#]*?\#!/)? (?: (?: @@ -127,6 +127,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + }, { + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d346cb1c..5364f0b19 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -62,6 +62,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE @@ -70,6 +71,7 @@ from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE +from .soundcloud import SoundcloudIE class GenericIE(InfoExtractor): @@ -473,7 +475,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -640,6 +642,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> @@ -781,6 +785,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -856,6 +869,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { @@ -1342,6 +1356,23 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Vimeo'], }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1978,12 +2009,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2146,6 +2174,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index fc0197ae1..8f7f232be 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -36,7 +36,6 @@ class InstagramIE(InfoExtractor): 'info_dict': { 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', - 'uploader_id': 'britneyspears', 'title': 'Video by britneyspears', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1453760977, diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 index 000000000..ade27a99e --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .arkena import ArkenaIE + + +class LcpPlayIE(ArkenaIE): + _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+' + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': '327336', + 'ext': 'mp4', + 'title': '327336', + 'timestamp': 1456391602, + 'upload_date': '20160225', + }, + 'params': { + 'skip_download': True, + }, + }] + + +class LcpIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)' + + _TESTS = [{ + # arkena embed + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': 'd56d03e9', + 'ext': 'mp4', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'description': 'md5:96ad55009548da9dea19f4120c6c16a8', + 'timestamp': 1456488895, + 'upload_date': '20160226', + }, + 'params': { + 'skip_download': True, + }, + }, { + # dailymotion live stream + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'id': 'xji3qy', + 'ext': 'mp4', + 'title': 'La Chaine Parlementaire (LCP), Live TNT', + 'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b', + 'uploader': 'LCP', + 'uploader_id': 'xbz33d', + 'timestamp': 1308923058, + 'upload_date': '20110624', + }, + 'params': { + # m3u8 live stream + 'skip_download': True, + }, + }, { + 'url': 'http://www.lcp.fr/emissions/277792-les-volontaires', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + play_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + webpage, 'play iframe', default=None, group='url') + + if not play_url: + return self.url_result(url, 'Generic') + + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + description = self._html_search_meta( + ('description', 'twitter:description'), webpage) + + return { + '_type': 'url_transparent', + 'ie_key': LcpPlayIE.ie_key(), + 'url': play_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index d970e94ec..27bdff8b2 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor): _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _TEST = { + _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { @@ -20,7 +20,11 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - } + }, { + # no tbr extracted from stream_url + 'url': 'http://www.mgtv.com/v/1/1/f/3324755.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -41,7 +45,8 @@ class MGTVIE(InfoExtractor): def extract_format(stream_url, format_id, idx, query={}): format_info = self._download_json( stream_url, video_id, - note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + note='Download video info for format %s' % (format_id or '#%d' % idx), + query=query) return { 'format_id': format_id, 'url': format_info['info'], diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f3ec2ebbc..2f455680e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -16,6 +16,7 @@ from ..utils import ( HEADRequest, sanitized_Request, strip_or_none, + timeconvert, unescapeHTML, url_basename, RegexNotFoundError, @@ -36,13 +37,13 @@ class MTVServicesInfoExtractor(InfoExtractor): return uri.split(':')[-1] # This was originally implemented for ComedyCentral, but it also works here - @staticmethod - def _transform_rtmp_url(rtmp_video_url): + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) if not m: - return rtmp_video_url + return {'rtmp': rtmp_video_url} base = 'http://viacommtvstrmfs.fplive.net/' - return base + m.group('finalid') + return {'http': base + m.group('finalid')} def _get_feed_url(self, uri): return self._FEED_URL @@ -86,14 +87,14 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue - new_url = self._transform_rtmp_url(rtmp_video_url) - formats.append({ + new_urls = self._transform_rtmp_url(rtmp_video_url) + formats.extend([{ 'ext': 'flv' if new_url.startswith('rtmp') else ext, 'url': new_url, - 'format_id': rendition.get('bitrate'), + 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), - }) + } for kind, new_url in new_urls.items()]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) @@ -136,6 +137,8 @@ class MTVServicesInfoExtractor(InfoExtractor): description = strip_or_none(xpath_text(itemdoc, 'description')) + timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) + title_el = None if title_el is None: title_el = find_xpath_attr( @@ -168,6 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor): 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), + 'timestamp': timestamp, } def _get_feed_query(self, uri): @@ -186,8 +190,13 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) + + title = xpath_text(idoc, './channel/title') + description = xpath_text(idoc, './channel/description') + return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')]) + [self._get_video_info(item) for item in idoc.findall('.//item')], + playlist_title=title, playlist_description=description) def _extract_mgid(self, webpage): try: @@ -233,6 +242,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', + 'timestamp': 1400126400, + 'upload_date': '20140515', }, } @@ -275,6 +286,8 @@ class MTVIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'timestamp': 1352610000, + 'upload_date': '20121111', }, }, ] @@ -301,20 +314,6 @@ class MTVIE(MTVServicesInfoExtractor): return self._get_videos_info(uri) -class MTVIggyIE(MTVServicesInfoExtractor): - IE_NAME = 'mtviggy.com' - _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' - _TEST = { - 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', - 'info_dict': { - 'id': '984696', - 'ext': 'mp4', - 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', - } - } - _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' - - class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$' @@ -322,7 +321,7 @@ class MTVDEIE(MTVServicesInfoExtractor): 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', 'info_dict': { 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'MusicVideo_cro-traum', 'description': 'Cro - Traum', }, @@ -330,20 +329,21 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', 'info_dict': { 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', }, 'params': { # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { - # single video in pagePlaylist with different id 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', @@ -355,6 +355,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] def _real_extract(self, url): @@ -367,11 +368,14 @@ class MTVDEIE(MTVServicesInfoExtractor): r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id) + def _mrss_url(item): + return item['mrss'] + item.get('mrssvars', '') + # news pages contain single video in playlist with different id if len(playlist) == 1: - return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) for item in playlist: item_id = item.get('id') if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(item['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(item), video_id) diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py deleted file mode 100644 index 9ccd7d774..000000000 --- a/youtube_dl/extractor/nextmovie.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode - - -class NextMovieIE(MTVServicesInfoExtractor): - IE_NAME = 'nextmovie.com' - _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' - _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' - _TESTS = [{ - 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', - 'md5': '09a9199f2f11f10107d04fcb153218aa', - 'info_dict': { - 'id': '961726', - 'ext': 'mp4', - 'title': 'The Muppets\' Gravity', - }, - }] - - def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ - 'feed': '1505', - 'mgid': uri, - }) - - def _real_extract(self, url): - mgid = self._match_id(url) - return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 4935002d0..9c54846e1 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -7,6 +7,7 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): + # None of videos on the website are still alive? IE_NAME = 'nick.com' _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 402d3a9f7..fc22ad5eb 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -59,11 +59,8 @@ class OnetBaseIE(InfoExtractor): # TODO: Support Microsoft Smooth Streaming continue elif ext == 'mpd': - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, mpd_id='dash', fatal=False)) - continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) else: formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index ccb23e069..6ae30679a 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,13 +137,16 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. - _TEST = { + _TESTS = [{ 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', 'only_matching': True, - } + }, { + 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', + 'only_matching': True, + }] def _real_extract(self, url): show_id = self._match_id(url) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index d2c92531b..20976c101 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -111,7 +111,7 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>', + r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index e7e5f653e..d592dfeb8 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -6,7 +6,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, ) @@ -37,28 +36,33 @@ class SharedIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage, urlh = self._download_webpage_handle(url, video_id) if '>File does not exist<' in webpage: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) download_form = self._hidden_inputs(webpage) - request = sanitized_Request( - url, urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( - request, video_id, 'Downloading video page') + urlh.geturl(), video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': urlh.geturl(), + }) video_url = self._html_search_regex( - r'data-url="([^"]+)"', video_page, 'video URL') + r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') title = base64.b64decode(self._html_search_meta( 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( - r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None) + r'data-poster=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'thumbnail', default=None, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 5c3fd0fec..114358786 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -13,20 +13,21 @@ from ..utils import ( sanitized_Request, unified_strdate, urlencode_postdata, + xpath_text, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '2a7b08249e6f5636557579c368040eb9', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', @@ -174,11 +175,11 @@ class SmotriIE(InfoExtractor): if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - video = self._download_json(request, video_id, 'Downloading video JSON') + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) video_url = video.get('_vidURL') or video.get('_vidURL_mp4') @@ -196,11 +197,11 @@ class SmotriIE(InfoExtractor): raise ExtractorError(msg, expected=True) title = video['title'] - thumbnail = video['_imgURL'] - upload_date = unified_strdate(video['added']) - uploader = video['userNick'] - uploader_id = video['userLogin'] - duration = int_or_none(video['duration']) + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -209,7 +210,7 @@ class SmotriIE(InfoExtractor): # Warning if video is unavailable warning = self._html_search_regex( - r'<div class="videoUnModer">(.*?)</div>', webpage, + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( @@ -217,20 +218,22 @@ class SmotriIE(InfoExtractor): (video_id, warning)) # Adult content - if re.search('EroConfirmText">', webpage) is not None: + if 'EroConfirmText">' in webpage: self.report_age_confirmation() confirm_string = self._html_search_regex( - r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( - 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', - webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL) + r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) return { 'id': video_id, @@ -249,37 +252,33 @@ class SmotriIE(InfoExtractor): class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', - 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - community_id = mobj.group('communityid') + community_id = self._match_id(url) - url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, 'Downloading community RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - community_title = self._html_search_regex( - '^Видео сообщества "([^"]+)"$', description_text, 'community title') - - return self.playlist_result(entries, community_id, community_title) + return self.playlist_result(entries, community_id) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { @@ -290,19 +289,19 @@ class SmotriUserIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('userid') + user_id = self._match_id(url) - url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = self._download_xml(url, user_id, 'Downloading user RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - user_nickname = self._html_search_regex( - '^Видео режиссера (.*)$', description_text, - 'user nickname') + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) return self.playlist_result(entries, user_id, user_nickname) @@ -310,11 +309,11 @@ class SmotriUserIE(InfoExtractor): class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' - _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('broadcastid') + broadcast_id = mobj.group('id') broadcast_url = 'http://' + mobj.group('url') broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') @@ -328,7 +327,8 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Erotic broadcasts allowed only for registered users') + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -343,8 +343,9 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') - if re.search('>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError('Unable to log in: bad username or password', expected=True) + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) adult_content = True else: @@ -383,11 +384,11 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_thumbnail = broadcast_json.get('_imgURL') broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json['description'] - broadcaster_nick = broadcast_json['nick'] - broadcaster_login = broadcast_json['login'] + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 194dabc71..aeae931a2 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -119,6 +119,12 @@ class SoundcloudIE(InfoExtractor): _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen('%s: Resolving id' % video_id) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 87b650468..a147f7db1 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -17,6 +17,8 @@ class SouthParkIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', }, }] @@ -28,6 +30,10 @@ class SouthParkEsIE(SouthParkIE): _TESTS = [{ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, 'playlist_count': 4, }] @@ -42,17 +48,27 @@ class SouthParkDeIE(SouthParkIE): 'info_dict': { 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', + 'title': 'South Park|The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', }, }, { # non-ASCII characters in initial URL 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }, { # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }] @@ -63,7 +79,11 @@ class SouthParkNlIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, }] @@ -74,5 +94,9 @@ class SouthParkDkIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, }] diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 63ea7718b..218785ee4 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -11,8 +11,10 @@ class SpikeIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', - 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', + 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + 'timestamp': 1388120400, + 'upload_date': '20131227', }, }, { 'url': 'http://www.spike.com/video-clips/lhtu8m/', diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 9092e9b85..58078c531 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -47,11 +47,10 @@ class TelegraafIE(InfoExtractor): ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', m3u8_id='hls')) + manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif ext == 'mpd': - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - continue + formats.extend(self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False)) else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index b73279dec..cb76a2a58 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -9,56 +9,23 @@ class TVLandIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', - 'playlist': [ - { - 'md5': '227e9723b9669c05bf51098b10287aa7', - 'info_dict': { - 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', - } - }, - { - 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', - 'info_dict': { - 'id': 'f4279548-6e13-40dd-92e8-860d27289197', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', - } - }, - { - 'md5': 'fde4c3bccd7cc7e3576b338734153cec', - 'info_dict': { - 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', - } - }, - { - 'md5': '247f6780cda6891f2e49b8ae2b10e017', - 'info_dict': { - 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', - } - }, - { - 'md5': 'fd269f33256e47bad5eb6c40de089ff6', - 'info_dict': { - 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', - } - } - ], + 'info_dict': { + 'description': 'md5:80973e81b916a324e05c14a3fb506d29', + 'title': 'The Invasion', + }, + 'playlist': [], }, { 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', 'md5': 'e2c6389401cf485df26c79c247b08713', 'info_dict': { 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', 'ext': 'mp4', - 'title': 'Younger|Younger: Hilary Duff - Little Lies', - 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + 'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269', + 'upload_date': '20151228', + 'timestamp': 1451289600, }, }] diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 5070082da..e84876b54 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -89,8 +89,8 @@ class TVPIE(InfoExtractor): r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url, 'video base url', default=None) if video_url_base: - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet + # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. # formats.extend(self._extract_mpd_formats( # video_url_base + '.ism/video.mpd', # video_id, mpd_id='dash', fatal=False)) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 67b1277cc..890f55180 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -461,7 +461,7 @@ class TwitchClipsIE(InfoExtractor): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { @@ -473,7 +473,11 @@ class TwitchClipsIE(InfoExtractor): 'uploader': 'stereotype_', 'uploader_id': 'stereotype_', }, - } + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -485,15 +489,27 @@ class TwitchClipsIE(InfoExtractor): r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), video_id, transform_source=js_to_json) - video_url = clip['clip_video_url'] - title = clip['channel_title'] + title = clip.get('channel_title') or self._og_search_title(webpage) + + formats = [{ + 'url': option['source'], + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + } for option in clip.get('quality_options', []) if option.get('source')] + + if not formats: + formats = [{ + 'url': clip['clip_video_url'], + }] + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), 'uploader': clip.get('curator_login'), 'uploader_id': clip.get('curator_display_name'), + 'formats': formats, } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 270ee8861..268080ba6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -116,12 +117,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = urlencode_postdata(login_form_strs) - - req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( - req, None, - note='Logging in', errnote='unable to log in', fatal=False) + self._PASSWORD_CHALLENGE_URL, None, + note='Logging in', errnote='unable to log in', fatal=False, + data=urlencode_postdata(login_form_strs)) if login_results is False: return False diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 42028125b..5fa066170 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2123,6 +2123,7 @@ def mimetype2ext(mt): 'dash+xml': 'mpd', 'f4m': 'f4m', 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', }.get(res, res) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 34b62480b..2cfa406d9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.17' +__version__ = '2016.07.28'