diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 47df0f348..af1c45421 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -44,7 +44,7 @@ class TestYoutubeLists(unittest.TestCase): ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') entries = result['entries'] - self.assertTrue(len(entries) >= 20) + self.assertTrue(len(entries) >= 50) original_video = entries[0] self.assertEqual(original_video['id'], 'OQpdSVF_k_w') diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 30277dc20..8d642fc3e 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -225,7 +225,7 @@ class FFmpegFD(ExternalFD): args += ['-i', url, '-c', 'copy'] if protocol == 'm3u8': - if self.params.get('hls_use_mpegts', False): + if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 3eed91279..a52d26cec 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -30,14 +30,14 @@ class AudiomackIE(InfoExtractor): # audiomack wrapper around soundcloud song { 'add_ie': ['Soundcloud'], - 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', + 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', 'info_dict': { - 'id': '172419696', + 'id': '258901379', 'ext': 'mp3', - 'description': 'md5:1fc3272ed7a635cce5be1568c2822997', - 'title': 'Young Thug ft Lil Wayne - Take Kare', - 'uploader': 'Young Thug World', - 'upload_date': '20141016', + 'description': 'mamba day freestyle for the legend Kobe Bryant ', + 'title': 'Black Mamba Freestyle [Prod. 
By Danny Wolf]', + 'uploader': 'ILOVEMAKONNEN', + 'upload_date': '20160414', } }, ] diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 425f08f2b..74c4510f9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -671,6 +671,7 @@ class BBCIE(BBCCoUkIE): 'info_dict': { 'id': '34475836', 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, }, { diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index c621a08d5..051d783a2 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -5,7 +5,6 @@ from ..utils import ( xpath_text, xpath_element, int_or_none, - ExtractorError, find_xpath_attr, ) @@ -64,7 +63,7 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): display_id = self._match_id(url) @@ -84,11 +83,11 @@ class CBSIE(CBSBaseIE): pid = xpath_text(item, 'pid') if not pid: continue - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) - except ExtractorError: - continue + tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid + if '.m3u8' in xpath_text(item, 'contentUrl', default=''): + tp_release_url += '&manifest=m3u' + tp_formats, tp_subtitles = self._extract_theplatform_smil( + tp_release_url, content_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 7bbf617d4..fa3cb7023 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -55,8 +56,13 @@ class EaglePlatformIE(InfoExtractor): raise ExtractorError(' '.join(response['errors']), expected=True) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) - self._handle_error(response) + try: + response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError): + response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + self._handle_error(response) + raise return response def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d00445b3c..aa83daa02 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -425,7 +425,6 @@ from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE -from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE from .motorsport import 
MotorsportIE @@ -470,7 +469,6 @@ from .ndr import ( from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE -from .nerdist import NerdistIE from .neteasemusic import ( NetEaseMusicIE, NetEaseMusicAlbumIE, @@ -753,7 +751,6 @@ from .teletask import TeleTaskIE from .testurl import TestURLIE from .tf1 import TF1IE from .theintercept import TheInterceptIE -from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py index ea32b621c..ba1c15414 100644 --- a/youtube_dl/extractor/gazeta.py +++ b/youtube_dl/extractor/gazeta.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class GazetaIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P[A-Za-z0-9-_.]+)\.s?html)' + _VALID_URL = r'(?Phttps?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', @@ -18,9 +18,22 @@ class GazetaIE(InfoExtractor): 'description': 'md5:38617526050bd17b234728e7f9620a71', 'thumbnail': 're:^https?://.*\.jpg', }, + 'skip': 'video not found', }, { 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', 'only_matching': True, + }, { + 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml', + 'info_dict': { + 'id': '252048', + 'ext': 'mp4', + 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['EaglePlatform'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5b22b6b5e..95d233259 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -105,7 +105,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, # infinite live stream }, 'expected_warnings': [ - r'501.*Not Implemented' + r'501.*Not Implemented', + r'400.*Bad Request', ], }, # Direct link with incorrect MIME type @@ -1955,7 +1956,8 @@ class GenericIE(InfoExtractor): # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: - return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + return self.url_result( + self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds liveleak_url = LiveLeakIE._extract_url(webpage) diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 1dc5701b2..059073749 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -69,7 +69,7 @@ class HuffPostIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif ext == 'f4m': - formats.extend(self._extract_f4m_formatsa( + formats.extend(self._extract_f4m_formats( url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False)) else: formats.append({ diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 11bb58d8a..3cbe77ad8 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,7 +12,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -38,10 +38,19 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, + }, { + 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', + 'only_matching': True, }] @staticmethod def _extract_embed_url(webpage): + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', + webpage) + if mobj: + return mobj.group('url') + blockquote_el = get_element_by_attribute( 'class', 'instagram-media', webpage) if blockquote_el is None: @@ -53,7 +62,9 @@ class InstagramIE(InfoExtractor): return mobj.group('link') def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + url = mobj.group('url') webpage = self._download_webpage(url, video_id) uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 88570f261..ea8fbb329 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -273,6 +273,9 @@ class IqiyiIE(InfoExtractor): 'title': '灌篮高手 国语版', }, 'playlist_count': 101, + }, { + 'url': 'http://www.pps.tv/w_19rrbav0ph.html', + 'only_matching': True, }] _FORMATS_MAP = [ diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index b4c30b7f3..a6050c4de 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -2,39 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - js_to_json, -) class KaraoketvIE(InfoExtractor): - _VALID_URL = r'https?://karaoketv\.co\.il/\?container=songs&id=(?P[0-9]+)' + _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P\d+)' _TEST = { - 'url': 'http://karaoketv.co.il/?container=songs&id=171568', + 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { - 'id': '171568', - 'ext': 'mp4', - 'title': 'אל העולם שלך - רותם כהן - שרים קריוקי', + 'id': '58356', + 'ext': 'flv', + 'title': 'קריוקי של איזון', + }, + 'params': { + # rtmp download + 'skip_download': True, } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + api_page_url = self._search_regex( + r']+src=(["\'])(?Phttps?://www\.karaoke\.co\.il/api_play\.php\?.+?)\1', + webpage, 'API play URL', group='url') - page_video_url = self._og_search_video_url(webpage, video_id) - config_json = compat_urllib_parse_unquote_plus(self._search_regex( - r'config=(.*)', page_video_url, 'configuration')) + api_page = self._download_webpage(api_page_url, video_id) + video_cdn_url = self._search_regex( + r']+src=(["\'])(?Phttps?://www\.video-cdn\.com/embed/iframe/.+?)\1', + api_page, 'video cdn URL', group='url') - urls_info_json = self._download_json( - config_json, video_id, 'Downloading configuration', - transform_source=js_to_json) + video_cdn = 
self._download_webpage(video_cdn_url, video_id) + play_path = self._parse_json( + self._search_regex( + r'var\s+options\s*=\s*({.+?});', video_cdn, 'options'), + video_id)['clip']['url'] - url = urls_info_json['playlist'][0]['url'] + settings = self._parse_json( + self._search_regex( + r'var\s+settings\s*=\s*({.+?});', video_cdn, 'servers', default='{}'), + video_id, fatal=False) or {} + + servers = settings.get('servers') + if not servers or not isinstance(servers, list): + servers = ('wowzail.video-cdn.com:80/vodcdn', ) + + formats = [{ + 'url': 'rtmp://%s' % server if not server.startswith('rtmp') else server, + 'play_path': play_path, + 'app': 'vodcdn', + 'page_url': video_cdn_url, + 'player_url': 'http://www.video-cdn.com/assets/flowplayer/flowplayer.commercial-3.2.18.swf', + 'rtmp_real_time': True, + 'ext': 'flv', + } for server in servers] return { 'id': video_id, 'title': self._og_search_title(webpage), - 'url': url, + 'formats': formats, } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 86c17c931..c0ece5113 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -268,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor): 'title': '八十年代精选', 'description': '这些都是属于八十年代的回忆!', }, - 'playlist_count': 30, + 'playlist_count': 24, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 2338e7f96..2100583df 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -49,8 +49,8 @@ class MDRIE(InfoExtractor): 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1419047100, - 'upload_date': '20141220', + 'timestamp': 1450950000, + 'upload_date': '20151224', 'duration': 4628, 'uploader': 'KIKA', }, @@ -71,8 +71,8 @@ class MDRIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_url = self._search_regex( - r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', - webpage, 'data url', default=None, group='url').replace('\/', '/') + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url').replace('\/', '/') doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py deleted file mode 100644 index a85109a89..000000000 --- a/youtube_dl/extractor/mooshare.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) - - -class MooshareIE(InfoExtractor): - IE_NAME = 'mooshare' - IE_DESC = 'Mooshare.biz' - _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P[\da-z]{12})' - - _TESTS = [ - { - 'url': 'http://mooshare.biz/8dqtk4bjbp8g', - 'md5': '4e14f9562928aecd2e42c6f341c8feba', - 'info_dict': { - 'id': '8dqtk4bjbp8g', - 'ext': 'mp4', - 'title': 'Comedy Football 2011 - (part 1-2)', - 'duration': 893, - }, - }, - { - 'url': 'http://mooshare.biz/aipjtoc4g95j', - 'info_dict': { - 'id': 'aipjtoc4g95j', - 'ext': 'mp4', - 'title': 'Orange Caramel Dashing Through the Snow', - 'duration': 212, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading page') - - if re.search(r'>Video Not 
Found or Deleted<', page) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - hash_key = self._html_search_regex(r'', page, 'hash') - title = self._html_search_regex(r'(?m)
\s*Watch ([^<]+)
', page, 'title') - - download_form = { - 'op': 'download1', - 'id': video_id, - 'hash': hash_key, - } - - request = sanitized_Request( - 'http://mooshare.biz/%s' % video_id, urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - self._sleep(5, video_id) - - video_page = self._download_webpage(request, video_id, 'Downloading video page') - - thumbnail = self._html_search_regex(r'image:\s*"([^"]+)",', video_page, 'thumbnail', fatal=False) - duration_str = self._html_search_regex(r'duration:\s*"(\d+)",', video_page, 'duration', fatal=False) - duration = int(duration_str) if duration_str is not None else None - - formats = [] - - # SD video - mobj = re.search(r'(?m)file:\s*"(?P[^"]+)",\s*provider:', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('url'), - 'format_id': 'sd', - 'format': 'SD', - }) - - # HD video - mobj = re.search(r'\'hd-2\': { file: \'(?P[^\']+)\' },', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('url'), - 'format_id': 'hd', - 'format': 'HD', - }) - - # rtmp video - mobj = re.search(r'(?m)file: "(?P[^"]+)",\s*streamer: "(?Prtmp://[^"]+)",', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('rtmpurl'), - 'play_path': mobj.group('playpath'), - 'rtmp_live': False, - 'ext': 'mp4', - 'format_id': 'rtmp', - 'format': 'HD', - }) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py index 50d92b50a..2174e5665 100644 --- a/youtube_dl/extractor/musicplayon.py +++ b/youtube_dl/extractor/musicplayon.py @@ -1,17 +1,21 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + mimetype2ext, +) class MusicPlayOnIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P\d+)' + _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=\d+&play)=(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://en.musicplayon.com/play?v=433377', + 'md5': '00cdcdea1726abdf500d1e7fd6dd59bb', 'info_dict': { 'id': '433377', 'ext': 'mp4', @@ -20,15 +24,16 @@ class MusicPlayOnIE(InfoExtractor): 'duration': 342, 'uploader': 'ultrafish', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } + }, { + 'url': 'http://en.musicplayon.com/play?pl=102&play=442629', + 'only_matching': True, + }] + + _URL_TEMPLATE = 'http://en.musicplayon.com/play?v=%s' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + url = self._URL_TEMPLATE % video_id page = self._download_webpage(url, video_id) @@ -40,28 +45,14 @@ class MusicPlayOnIE(InfoExtractor): uploader = self._html_search_regex( r'', page, 'uploader', fatal=False) - formats = [ - { - 'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id, - 'ext': 'mp4', - } - ] - - manifest = self._download_webpage( - 'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest') - - for entry in manifest.split('#')[1:]: - if entry.startswith('EXT-X-STREAM-INF:'): - meta, url, _ = entry.split('\n') - params = dict(param.split('=') for param in meta.split(',')[1:]) - formats.append({ - 'url': url, - 'ext': 'mp4', 
- 'tbr': int(params['BANDWIDTH']), - 'width': int(params['RESOLUTION'].split('x')[1]), - 'height': int(params['RESOLUTION'].split('x')[-1]), - 'format_note': params['NAME'].replace('"', '').strip(), - }) + sources = self._parse_json( + self._search_regex(r'setup\[\'_sources\'\]\s*=\s*([^;]+);', page, 'video sources'), + video_id, transform_source=js_to_json) + formats = [{ + 'url': compat_urlparse.urljoin(url, source['src']), + 'ext': mimetype2ext(source.get('type')), + 'format_note': source.get('data-res'), + } for source in sources] return { 'id': video_id, diff --git a/youtube_dl/extractor/nerdist.py b/youtube_dl/extractor/nerdist.py deleted file mode 100644 index c6dc34be4..000000000 --- a/youtube_dl/extractor/nerdist.py +++ /dev/null @@ -1,80 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..utils import ( - determine_ext, - parse_iso8601, - xpath_text, -) - - -class NerdistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nerdist\.com/vepisode/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.nerdist.com/vepisode/exclusive-which-dc-characters-w', - 'md5': '3698ed582931b90d9e81e02e26e89f23', - 'info_dict': { - 'display_id': 'exclusive-which-dc-characters-w', - 'id': 'RPHpvJyr', - 'ext': 'mp4', - 'title': 'Your TEEN TITANS Revealed! Who\'s on the show?', - 'thumbnail': 're:^https?://.*/thumbs/.*\.jpg$', - 'description': 'Exclusive: Find out which DC Comics superheroes will star in TEEN TITANS Live-Action TV Show on Nerdist News with Jessica Chobot!', - 'uploader': 'Eric Diaz', - 'upload_date': '20150202', - 'timestamp': 1422892808, - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'''(?x)([^>]+)', webpage, 'title') + uploader = self._html_search_regex( - r',"artist":"([^"]+)",', webpage, 'music uploader') + [r',"artist":"([^"]+)",', r'[\'"]owner[\'"]\s*:\s*[\'"]([^\'"]+)[\'"],'], + webpage, 'uploader') music_url_json_string = self._html_search_regex( r'({"url":"[^"]+"),', webpage, 'music url') + '}' diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 0f1f448fe..6e843c327 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + determine_ext, + int_or_none, +) class OnionStudiosIE(InfoExtractor): @@ -17,7 +20,7 @@ class OnionStudiosIE(InfoExtractor): 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'description': 'md5:545299bda6abf87e5ec666548c6a9448', + 'description': 'md5:e786add7f280b7f0fe237b64cc73df76', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'The A.V. 
Club', 'uploader_id': 'TheAVClub', @@ -42,9 +45,19 @@ class OnionStudiosIE(InfoExtractor): formats = [] for src in re.findall(r']+src="([^"]+)"', webpage): - if determine_ext(src) != 'm3u8': # m3u8 always results in 403 + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + height = int_or_none(self._search_regex( + r'/(\d+)\.%s' % ext, src, 'height', default=None)) formats.append({ + 'format_id': ext + ('-%sp' % height if height else ''), 'url': src, + 'height': height, + 'ext': ext, + 'preference': 1, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py index cce84b9e4..fca30e1aa 100644 --- a/youtube_dl/extractor/puls4.py +++ b/youtube_dl/extractor/puls4.py @@ -40,7 +40,7 @@ class Puls4IE(InfoExtractor): webpage = self._download_webpage(url, video_id) error_message = self._html_search_regex( - r'
(.+?)', + r']+class="message-error"[^>]*>(.+?)
', webpage, 'error message', default=None) if error_message: raise ExtractorError( diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 4f0c66213..e5c28ae89 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + js_to_json, unified_strdate, ) @@ -94,19 +95,32 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - hls = self._search_regex( - r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", - webpage, 'hls file') + formats = [] + + def cleanup_js(code): + # desktop_advert_config contains complex Javascripts and we don't need it + return js_to_json(re.sub(r'desktop_advert_config.*', '', code)) + + jwplayer_data = self._parse_json(self._search_regex( + r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id, + transform_source=cleanup_js) + + hls_url = jwplayer_data.get('hls_url') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls')) + + rtsp_url = jwplayer_data.get('rtsp_url') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) - formats = self._extract_m3u8_formats(hls, video_id, 'mp4') self._sort_formats(formats) - title = self._search_regex( - r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') - - thumbnail = self._search_regex( - r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', - webpage, 'thumbnail', default=None) + title = jwplayer_data['node_title'] + thumbnail = jwplayer_data.get('image_url') return { 'id': video_id, diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py deleted file mode 100644 index 10239c906..000000000 --- a/youtube_dl/extractor/theonion.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class TheOnionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P[0-9]+)/?' 
- _TEST = { - 'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/', - 'md5': '19eaa9a39cf9b9804d982e654dc791ee', - 'info_dict': { - 'id': '2133', - 'ext': 'mp4', - 'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image', - 'description': 'md5:cc12448686b5600baae9261d3e180910', - 'thumbnail': 're:^https?://.*\.jpg\?\d+$', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'"videoId":\s(\d+),', webpage, 'video ID') - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - sources = re.findall(r']+class="animated-gif"(?P[^>]+)>\s* ]+video-src="(?P[^"]+)" diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index b5fe753d7..54605d863 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -41,6 +41,12 @@ class UstreamIE(InfoExtractor): 'uploader': 'sportscanadatv', }, 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', + }, { + 'url': 'http://www.ustream.tv/embed/10299409', + 'info_dict': { + 'id': '10299409', + }, + 'playlist_count': 3, }] def _real_extract(self, url): @@ -55,10 +61,12 @@ class UstreamIE(InfoExtractor): if m.group('type') == 'embed': video_id = m.group('id') webpage = self._download_webpage(url, video_id) - desktop_video_id = self._html_search_regex( - r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') - desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id - return self.url_result(desktop_url, 'Ustream') + content_video_ids = self._parse_json(self._search_regex( + r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage, + 'content video IDs'), video_id) + return self.playlist_result( + map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids), + video_id) params = self._download_json( 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py index 9369abaf8..84698371a 100644 --- a/youtube_dl/extractor/varzesh3.py +++ b/youtube_dl/extractor/varzesh3.py @@ -2,11 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + clean_html, + remove_start, +) class Varzesh3IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P[^/]+)/?' 
- _TEST = { + _TESTS = [{ 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', 'md5': '2a933874cb7dce4366075281eb49e855', 'info_dict': { @@ -15,8 +23,19 @@ class Varzesh3IE(InfoExtractor): 'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا', 'description': 'فصل ۲۰۱۵-۲۰۱۴', 'thumbnail': 're:^https?://.*\.jpg$', - } - } + }, + 'skip': 'HTTP 404 Error', + }, { + 'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87', + 'md5': '841b7cd3afbc76e61708d94e53a4a4e7', + 'info_dict': { + 'id': '112785', + 'ext': 'mp4', + 'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره', + 'description': 'فوتبال 120', + }, + 'expected_warnings': ['description'], + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -26,15 +45,30 @@ class Varzesh3IE(InfoExtractor): video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') - title = self._og_search_title(webpage) + title = remove_start(self._html_search_regex( + r'([^<]+)', webpage, 'title'), 'ویدیو ورزش 3 | ') + description = self._html_search_regex( r'(?s)
(.+?)
', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) + webpage, 'description', default=None) + if description is None: + description = clean_html(self._html_search_meta('description', webpage)) + + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail is None: + fb_sharer_url = self._search_regex( + r']+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"', + webpage, 'facebook sharer URL', fatal=False) + sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query) + thumbnail = sharer_params.get('p[images][0]', [None])[0] video_id = self._search_regex( r"]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", - webpage, display_id, default=display_id) + webpage, display_id, default=None) + if video_id is None: + video_id = self._search_regex( + 'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', + default=display_id) return { 'url': video_url, diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 46c785ae1..95daf4dfd 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ExtractorError @@ -14,13 +13,21 @@ class ViceIE(InfoExtractor): 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', 'info_dict': { 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', 'duration': 725.983, }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, + }, { + 'url': 'http://www.vice.com/video/how-to-hack-a-car', + 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', }, }, { 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', @@ -39,11 +46,14 @@ class ViceIE(InfoExtractor): try: embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, - 'ooyala embed code') - ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + 'ooyala embed code', default=None) + if embed_code: + return self.url_result('ooyala:%s' % embed_code, 'Ooyala') + youtube_id = self._search_regex( + r'data-youtube-id="([^"]+)"', webpage, 'youtube id') + return self.url_result(youtube_id, 'Youtube') except ExtractorError: raise ExtractorError('The page doesn\'t contain a video', expected=True) - return self.url_result(ooyala_url, ie='Ooyala') class ViceShowIE(InfoExtractor): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5a102de51..a4dd628a1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1818,20 +1818,32 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading Youtube mix') + ids = [] + last_id = playlist_id[-11:] + for n in itertools.count(1): + url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + webpage = self._download_webpage( + url, playlist_id, 'Downloading page {0} of Youtube 
mix'.format(n)) + new_ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) + # Fetch new pages until all the videos are repeated, it seems that + # there are always 51 unique videos. + new_ids = [_id for _id in new_ids if _id not in ids] + if not new_ids: + break + ids.extend(new_ids) + last_id = ids[-1] + + url_results = self._ids_to_results(ids) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1987,8 +1999,8 @@ class YoutubeUserIE(YoutubeChannelIE): def suitable(cls, url): # Don't return True if the url can be extracted with other youtube # extractor, the regex would is too permissive and it would match. - other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_ies): + other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_yt_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index b64cd396b..1793a878c 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -175,7 +175,8 @@ class FFmpegPostProcessor(PostProcessor): # Always use 'file:' because the filename may contain ':' (ffmpeg # interprets that as a protocol) or can start with '-' (-- is broken in # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) - return 'file:' + fn + # Also leave '-' intact in order not to break streaming to stdout. + return 'file:' + fn if fn != '-' else fn class FFmpegExtractAudioPP(FFmpegPostProcessor):
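
Note on the two '-' special cases above (in FFmpegFD and FFmpegPostProcessor): both exist so that piping output to stdout keeps working. MP4 needs a seekable output, so when the target filename is '-' the downloader has to fall back to MPEG-TS, and the post-processor must not wrap '-' in the 'file:' protocol prefix. A minimal standalone sketch of that logic follows; the helper names are illustrative only, not youtube-dl API.

    # Sketch of the container / filename handling added by the patch.
    # choose_ffmpeg_muxer_args() and ffmpeg_filename_argument() are
    # hypothetical helpers used here only to show the decision logic.

    def choose_ffmpeg_muxer_args(protocol, tmpfilename, hls_use_mpegts=False):
        """Pick ffmpeg output-format args for an HLS (m3u8) download."""
        if protocol == 'm3u8':
            if hls_use_mpegts or tmpfilename == '-':
                # MP4 requires a seekable output file, so streaming to
                # stdout has to use MPEG-TS instead.
                return ['-f', 'mpegts']
            return ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
        return []

    def ffmpeg_filename_argument(fn):
        # 'file:' protects names containing ':' or starting with '-',
        # but would break '-' itself (stdout), so leave that one alone.
        return 'file:' + fn if fn != '-' else fn

    assert choose_ffmpeg_muxer_args('m3u8', '-') == ['-f', 'mpegts']
    assert choose_ffmpeg_muxer_args('m3u8', 'out.mp4') == ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
    assert ffmpeg_filename_argument('-') == '-'
    assert ffmpeg_filename_argument('-weird name.mp4') == 'file:-weird name.mp4'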