diff --git a/AUTHORS b/AUTHORS index 73fcafeb5..a46799506 100644 --- a/AUTHORS +++ b/AUTHORS @@ -154,3 +154,5 @@ Brian Foley Vignesh Venkat Tom Gijselinck Founder Fang +Andrew Alexeyew +Saso Bezlaj diff --git a/README.md b/README.md index 9dbeae1bc..7c582511f 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,10 @@ which means you can modify it, redistribute it or use it however you like. expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental) + --hls-use-mpegts Use the mpegts container for HLS videos, + allowing to play the video while + downloading (some players may not be able + to play it) --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,axel,curl,httpie,wget @@ -339,8 +343,8 @@ which means you can modify it, redistribute it or use it however you like. preference, for example: "srt" or "ass/srt/best" --sub-lang LANGS Languages of the subtitles to download - (optional) separated by commas, use IETF - language tags like 'en,pt' + (optional) separated by commas, use --list- + subs for available language tags ## Authentication Options: -u, --username USERNAME Login with this account ID diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 99b1e2731..0644436a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -24,7 +24,7 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **AE** + - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network - **Aftonbladet** - **AirMozilla** - **AlJazeera** @@ -43,6 +43,7 @@ - **ARD:mediathek** - **arte.tv** - **arte.tv:+7** + - **arte.tv:cinema** - **arte.tv:concert** - **arte.tv:creative** - **arte.tv:ddc** @@ -54,6 +55,7 @@ - **audiomack** - **audiomack:album** - **Azubu** + - **AzubuLive** - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** @@ -124,6 +126,7 @@ - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** + - **CultureUnplugged** - **CWTV** - **dailymotion** - **dailymotion:playlist** @@ -141,6 +144,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Digiteka** - **Discovery** - **Dotsub** - **DouyuTV**: 斗鱼 @@ -177,6 +181,7 @@ - **ExpoTV** - **ExtremeTube** - **facebook** + - **facebook:post** - **faz.net** - **fc2** - **Fczenit** @@ -287,7 +292,9 @@ - **la7.tv** - **Laola1Tv** - **Lecture2Go** + - **Lemonde** - **Letv**: 乐视网 + - **LetvCloud**: 乐视云 - **LetvPlaylist** - **LetvTv** - **Libsyn** @@ -300,6 +307,7 @@ - **livestream** - **livestream:original** - **LnkGo** + - **LoveHomePorn** - **lrt.lt** - **lynda**: lynda.com videos - **lynda:course**: lynda.com online courses @@ -308,6 +316,7 @@ - **mailru**: Видео@Mail.Ru - **MakerTV** - **Malemotion** + - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **metacafe** @@ -486,6 +495,7 @@ - **rtve.es:live**: RTVE.es live streams - **RTVNH** - **RUHD** + - **RulePorn** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -499,6 +509,7 @@ - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au + - **schooltv** - **SciVee** - **screen.yahoo:search**: Yahoo screen search - **Screencast** @@ -602,6 +613,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **trollvids** - **TruTube** - **Tube8** - **TubiTv** @@ -640,7 +652,6 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 - - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt - **ustream** @@ -707,6 +718,7 @@ - **WebOfStories** - **WebOfStoriesPlaylist** - **Weibo** + - **WeiqiTV**: WQTV - **wholecloud**: WholeCloud - **Wimp** - **Wistia** @@ -758,3 +770,4 @@ - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums - **zingmp3:song**: mp3.zing.vn songs + - **ZippCast** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 0caa43843..b53cfbe78 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -221,6 +221,16 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') + formats = [ + {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo[vcodec=avc1.123456]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13', diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a0c11e6c1..f5af184e6 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -56,7 +56,7 @@ class TestAllURLsMatching(unittest.TestCase): assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') def test_youtube_user_matching(self): - self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) def test_youtube_feeds(self): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 9ed9fe622..9a695c4e8 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -21,7 +21,7 @@ from youtube_dl.extractor import ( NPOIE, ComedyCentralIE, NRKTVIE, - RaiIE, + RaiTVIE, VikiIE, ThePlatformIE, ThePlatformFeedIE, @@ -260,7 +260,7 @@ class TestNRKSubtitles(BaseTestSubtitles): class TestRaiSubtitles(BaseTestSubtitles): url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' - IE = RaiIE + IE = RaiTVIE def test_allsubtitles(self): self.DL.params['writesubtitles'] = True diff --git a/test/test_update.py b/test/test_update.py new file mode 100644 index 000000000..d9c71511d --- /dev/null +++ b/test/test_update.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +import json +from youtube_dl.update import rsa_verify + + +class TestUpdate(unittest.TestCase): + def test_rsa_verify(self): + UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) + with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'versions.json'), 'rb') as f: + versions_info = f.read().decode() + versions_info = json.loads(versions_info) + signature = versions_info['signature'] + del versions_info['signature'] + self.assertTrue(rsa_verify( + json.dumps(versions_info, sort_keys=True).encode('utf-8'), + signature, UPDATES_RSA_KEY)) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 26aadb34f..47df0f348 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -34,7 +34,7 @@ class TestYoutubeLists(unittest.TestCase): ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) videos course result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - entries = result['entries'] + entries = list(result['entries']) self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') diff --git a/test/versions.json b/test/versions.json new file mode 100644 index 000000000..6cccc2259 --- /dev/null +++ b/test/versions.json @@ -0,0 +1,34 @@ +{ + "latest": "2013.01.06", + "signature": "72158cdba391628569ffdbea259afbcf279bbe3d8aeb7492690735dc1cfa6afa754f55c61196f3871d429599ab22f2667f1fec98865527b32632e7f4b3675a7ef0f0fbe084d359256ae4bba68f0d33854e531a70754712f244be71d4b92e664302aa99653ee4df19800d955b6c4149cd2b3f24288d6e4b40b16126e01f4c8ce6", + "versions": { + "2013.01.02": { + "bin": [ + "http://youtube-dl.org/downloads/2013.01.02/youtube-dl", + "f5b502f8aaa77675c4884938b1e4871ebca2611813a0c0e74f60c0fbd6dcca6b" + ], + "exe": [ + "http://youtube-dl.org/downloads/2013.01.02/youtube-dl.exe", + "75fa89d2ce297d102ff27675aa9d92545bbc91013f52ec52868c069f4f9f0422" + ], + "tar": [ + "http://youtube-dl.org/downloads/2013.01.02/youtube-dl-2013.01.02.tar.gz", + "6a66d022ac8e1c13da284036288a133ec8dba003b7bd3a5179d0c0daca8c8196" + ] + }, + "2013.01.06": { + "bin": [ + "http://youtube-dl.org/downloads/2013.01.06/youtube-dl", + "64b6ed8865735c6302e836d4d832577321b4519aa02640dc508580c1ee824049" + ], + "exe": [ + "http://youtube-dl.org/downloads/2013.01.06/youtube-dl.exe", + "58609baf91e4389d36e3ba586e21dab882daaaee537e4448b1265392ae86ff84" + ], + "tar": [ + "http://youtube-dl.org/downloads/2013.01.06/youtube-dl-2013.01.06.tar.gz", + "fe77ab20a95d980ed17a659aa67e371fdd4d656d19c4c7950e7b720b0c2f1a86" + ] + } + } +} \ No newline at end of file diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 09d2b18f2..2a3d6cd4a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -263,7 +263,7 @@ class YoutubeDL(object): the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args. + xattr_set_filesize, external_downloader_args, hls_use_mpegts. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, @@ -906,7 +906,7 @@ class YoutubeDL(object): str_operator_rex = re.compile(r'''(?x) \s*(?Pext|acodec|vcodec|container|protocol) \s*(?P%s)(?P\s*\?)? - \s*(?P[a-zA-Z0-9_-]+) + \s*(?P[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9f131f5db..f5f064241 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -369,6 +369,7 @@ def _real_main(argv=None): 'no_color': opts.no_color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, + 'hls_use_mpegts': opts.hls_use_mpegts, 'external_downloader_args': external_downloader_args, 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index beae8c4d0..de815612c 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -45,6 +45,7 @@ class FileDownloader(object): (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. + hls_use_mpegts: Use the mpegts container for HLS videos. Subclasses of this one must re-define the real_download method. """ @@ -295,7 +296,7 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) + self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index aaf0c49c8..fc9642905 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -273,15 +273,21 @@ class F4mFD(FragmentFD): return fragments_list def _parse_bootstrap_node(self, node, base_url): - if node.text is None: + # Sometimes non empty inline bootstrap info can be specified along + # with bootstrap url attribute (e.g. dummy inline bootstrap info + # contains whitespace characters in [1]). We will prefer bootstrap + # url over inline bootstrap info when present. + # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m + bootstrap_url = node.get('url') + if bootstrap_url: bootstrap_url = compat_urlparse.urljoin( - base_url, node.attrib['url']) + base_url, bootstrap_url) boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None bootstrap = base64.b64decode(node.text.encode('ascii')) boot_info = read_bootstrap_info(bootstrap) - return (boot_info, bootstrap_url) + return boot_info, bootstrap_url def real_download(self, filename, info_dict): man_url = info_dict['url'] @@ -316,7 +322,8 @@ class F4mFD(FragmentFD): metadata = None fragments_list = build_fragments_list(boot_info) - if self.params.get('test', False): + test = self.params.get('test', False) + if test: # We only download the first fragment fragments_list = fragments_list[:1] total_frags = len(fragments_list) @@ -326,6 +333,7 @@ class F4mFD(FragmentFD): ctx = { 'filename': filename, 'total_frags': total_frags, + 'live': live, } self._prepare_frag_download(ctx) @@ -380,7 +388,7 @@ class F4mFD(FragmentFD): else: raise - if not fragments_list and live and bootstrap_url: + if not fragments_list and not test and live and bootstrap_url: fragments_list = self._update_live_fragments(bootstrap_url, frag_i) total_frags += len(fragments_list) if fragments_list and (fragments_list[0][1] > frag_i + 1): diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 0c9113d0f..8b96eceb9 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -26,7 +26,11 @@ class FragmentFD(FileDownloader): self._start_frag_download(ctx) def _prepare_frag_download(self, ctx): - self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) + if 'live' not in ctx: + ctx['live'] = False + self.to_screen( + '[%s] Total fragments: %s' + % (self.FD_NAME, ctx['total_frags'] if not ctx['live'] else 'unknown (live)')) self.report_destination(ctx['filename']) dl = HttpQuietDownloader( self.ydl, @@ -74,14 +78,14 @@ class FragmentFD(FileDownloader): if s['status'] not in ('downloading', 'finished'): return - frag_total_bytes = s.get('total_bytes') or 0 - - estimated_size = ( - (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / - (state['frag_index'] + 1) * total_frags) time_now = time.time() - state['total_bytes_estimate'] = estimated_size state['elapsed'] = time_now - start + frag_total_bytes = s.get('total_bytes') or 0 + if not ctx['live']: + estimated_size = ( + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) + state['total_bytes_estimate'] = estimated_size if s['status'] == 'finished': state['frag_index'] += 1 @@ -91,9 +95,10 @@ class FragmentFD(FileDownloader): else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - state['eta'] = self.calc_eta( - start, time_now, estimated_size, - state['downloaded_bytes']) + if not ctx['live']: + state['eta'] = self.calc_eta( + start, time_now, estimated_size, + state['downloaded_bytes']) state['speed'] = s.get('speed') ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 10b83c6b2..cb34dc4ab 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -39,7 +39,11 @@ class HlsFD(FileDownloader): '-headers', ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + args += ['-i', url, '-c', 'copy'] + if self.params.get('hls_use_mpegts', False): + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] args = [encodeArgument(opt) for opt in args] args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bab3d7b46..dbdfb86c0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -42,6 +42,7 @@ from .arte import ( ArteTVCreativeIE, ArteTVConcertIE, ArteTVFutureIE, + ArteTVCinemaIE, ArteTVDDCIE, ArteTVEmbedIE, ) @@ -49,7 +50,7 @@ from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE from .audiomack import AudiomackIE, AudiomackAlbumIE -from .azubu import AzubuIE +from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE @@ -195,7 +196,10 @@ from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE -from .facebook import FacebookIE +from .facebook import ( + FacebookIE, + FacebookPostIE, +) from .faz import FazIE from .fc2 import FC2IE from .fczenit import FczenitIE @@ -357,6 +361,7 @@ from .livestream import ( LivestreamShortenerIE, ) from .lnkgo import LnkGoIE +from .lovehomeporn import LoveHomePornIE from .lrt import LRTIE from .lynda import ( LyndaIE, @@ -367,6 +372,7 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makertv import MakerTVIE from .malemotion import MalemotionIE +from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE @@ -477,6 +483,7 @@ from .npo import ( NPOLiveIE, NPORadioIE, NPORadioFragmentIE, + SchoolTVIE, VPROIE, WNLIE ) @@ -579,6 +586,7 @@ from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE from .rtvnh import RTVNHIE from .ruhd import RUHDIE +from .ruleporn import RulePornIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -725,6 +733,7 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trollvids import TrollvidsIE from .trutube import TruTubeIE from .tube8 import Tube8IE from .tubitv import TubiTvIE diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index be7913bc7..92eee8119 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,11 +8,7 @@ from ..compat import compat_str from ..utils import int_or_none -class ACastBaseIE(InfoExtractor): - _API_BASE_URL = 'https://www.acast.com/api/' - - -class ACastIE(ACastBaseIE): +class ACastIE(InfoExtractor): IE_NAME = 'acast' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' _TEST = { @@ -23,14 +19,19 @@ class ACastIE(ACastBaseIE): 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', 'timestamp': 1196172000000, - 'description': 'md5:0c5d8201dfea2b93218ea986c91eee6e', + 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, } } def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - cast_data = self._download_json(self._API_BASE_URL + 'channels/%s/acasts/%s/playback' % (channel, display_id), display_id) + + embed_page = self._download_webpage( + re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id) + cast_data = self._parse_json(self._search_regex( + r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'), + display_id)['GetAcast/%s/%s' % (channel, display_id)] return { 'id': compat_str(cast_data['id']), @@ -44,7 +45,7 @@ class ACastIE(ACastBaseIE): } -class ACastChannelIE(ACastBaseIE): +class ACastChannelIE(InfoExtractor): IE_NAME = 'acast:channel' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/#?]+)' _TEST = { @@ -56,6 +57,7 @@ class ACastChannelIE(ACastBaseIE): }, 'playlist_mincount': 20, } + _API_BASE_URL = 'https://www.acast.com/api/' @classmethod def suitable(cls, url): diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 7d65b8193..190bc2cc8 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( qualities, + unescapeHTML, + xpath_element, ) @@ -31,7 +33,7 @@ class AllocineIE(InfoExtractor): 'id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', - 'description': 'md5:eeaffe7c2d634525e21159b93acf3b1e', + 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -41,7 +43,7 @@ class AllocineIE(InfoExtractor): 'id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', - 'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac', + 'description': 'md5:601d15393ac40f249648ef000720e7e3', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -59,14 +61,18 @@ class AllocineIE(InfoExtractor): if typ == 'film': video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') else: - player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') - - player_data = json.loads(player) - video_id = compat_str(player_data['refMedia']) + player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player', default=None) + if player: + player_data = json.loads(player) + video_id = compat_str(player_data['refMedia']) + else: + model = self._search_regex(r'data-model="([^"]+)">', webpage, 'data model') + model_data = self._parse_json(unescapeHTML(model), display_id) + video_id = compat_str(model_data['id']) xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) - video = xml.find('.//AcVisionVideo').attrib + video = xpath_element(xml, './/AcVisionVideo').attrib quality = qualities(['ld', 'md', 'hd']) formats = [] diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 23f942ae2..2fd912da4 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,11 +1,9 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor +from .nuevo import NuevoBaseIE -class AnitubeIE(InfoExtractor): +class AnitubeIE(NuevoBaseIE): IE_NAME = 'anitube.se' _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P\d+)' @@ -22,38 +20,11 @@ class AnitubeIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) key = self._search_regex( r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') - config_xml = self._download_xml( - 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - - video_title = config_xml.find('title').text - thumbnail = config_xml.find('image').text - duration = float(config_xml.find('duration').text) - - formats = [] - video_url = config_xml.find('file') - if video_url is not None: - formats.append({ - 'format_id': 'sd', - 'url': video_url.text, - }) - video_url = config_xml.find('filehd') - if video_url is not None: - formats.append({ - 'format_id': 'hd', - 'url': video_url.text, - }) - - return { - 'id': video_id, - 'title': video_title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats - } + return self._extract_nuevo( + 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, video_id) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 10301a8ea..b9e07f0ef 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -199,25 +199,19 @@ class ArteTVCreativeIE(ArteTVPlus7IE): class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' - _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(thema|sujet)/.*?#article-anchor-(?P\d+)' + _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(?P.+)' - _TEST = { - 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + _TESTS = [{ + 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', 'info_dict': { - 'id': '5201', + 'id': '050940-028-A', 'ext': 'mp4', - 'title': 'Les champignons au secours de la planète', - 'upload_date': '20131101', + 'title': 'Les écrevisses aussi peuvent être anxieuses', }, - } - - def _real_extract(self, url): - anchor_id, lang = self._extract_url_info(url) - webpage = self._download_webpage(url, anchor_id) - row = self._search_regex( - r'(?s)id="%s"[^>]*>.+?(]*arte_vp_url[^>]*>)' % anchor_id, - webpage, 'row') - return self._extract_from_webpage(row, anchor_id, lang) + }, { + 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', + 'only_matching': True, + }] class ArteTVDDCIE(ArteTVPlus7IE): @@ -255,6 +249,23 @@ class ArteTVConcertIE(ArteTVPlus7IE): } +class ArteTVCinemaIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:cinema' + _VALID_URL = r'https?://cinema\.arte\.tv/(?Pde|fr)/(?P.+)' + + _TEST = { + 'url': 'http://cinema.arte.tv/de/node/38291', + 'md5': '6b275511a5107c60bacbeeda368c3aa1', + 'info_dict': { + 'id': '055876-000_PWA12025-D', + 'ext': 'mp4', + 'title': 'Tod auf dem Nil', + 'upload_date': '20160122', + 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + }, + } + + class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 0961d339f..011edf128 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( + ExtractorError, + float_or_none, + sanitized_Request, +) class AzubuIE(InfoExtractor): @@ -91,3 +95,37 @@ class AzubuIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class AzubuLiveIE(InfoExtractor): + _VALID_URL = r'http://www.azubu.tv/(?P[^/]+)$' + + _TEST = { + 'url': 'http://www.azubu.tv/MarsTVMDLen', + 'only_matching': True, + } + + def _real_extract(self, url): + user = self._match_id(url) + + info = self._download_json( + 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user), + user)['data'] + if info['type'] != 'STREAM': + raise ExtractorError('{0} is not streaming live'.format(user), expected=True) + + req = sanitized_Request( + 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id']) + req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV') + bc_info = self._download_json(req, user) + m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') + formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') + + return { + 'id': info['id'], + 'title': self._live_title(info['title']), + 'uploader_id': user, + 'formats': formats, + 'is_live': True, + 'thumbnail': bc_info['poster'], + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 1c493b72d..6ddee686c 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -193,6 +193,19 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + }, { + # compact player (https://github.com/rg3/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -482,9 +495,11 @@ class BBCCoUkIE(InfoExtractor): if programme_id: formats, subtitles = self._download_media_selector(programme_id) title = self._og_search_title(webpage, default=None) or self._html_search_regex( - r']+id="parent-title"[^>]*>(.+?)', webpage, 'title') + (r']+id="parent-title"[^>]*>(.+?)', + r']+class="info"[^>]*>\s*

(.+?)

'), webpage, 'title') description = self._search_regex( - r'

([^<]+)

', + (r'

([^<]+)

', + r']+class="info_+synopsis"[^>]*>([^<]+)'), webpage, 'description', default=None) if not description: description = self._html_search_meta('description', webpage) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 510813f76..c28e72927 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -1,7 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + js_to_json, + determine_ext, +) class BpbIE(InfoExtractor): @@ -10,7 +16,8 @@ class BpbIE(InfoExtractor): _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - 'md5': '0792086e8e2bfbac9cdf27835d5f2093', + # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 + 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', 'ext': 'mp4', @@ -25,13 +32,26 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'

(.*?)

', webpage, 'title') - video_url = self._html_search_regex( - r'(http://film\.bpb\.de/player/dokument_[0-9]+\.mp4)', - webpage, 'video URL') + video_info_dicts = re.findall( + r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + + formats = [] + for video_info in video_info_dicts: + video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) + quality = video_info['quality'] + video_url = video_info['src'] + formats.append({ + 'url': video_url, + 'preference': 10 if quality == 'high' else 0, + 'format_note': quality, + 'format_id': '%s-%s' % (quality, determine_ext(video_url)), + }) + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': self._og_search_description(webpage), } diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index d211ec23b..cabf7e73b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -4,11 +4,10 @@ from __future__ import unicode_literals import re import json -from .common import InfoExtractor -from ..utils import remove_start +from .theplatform import ThePlatformIE -class CBSNewsIE(InfoExtractor): +class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P[\da-z_-]+)' @@ -31,13 +30,18 @@ class CBSNewsIE(InfoExtractor): 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, @@ -58,32 +62,23 @@ class CBSNewsIE(InfoExtractor): duration = item.get('duration') thumbnail = item.get('mediaImage') or item.get('thumbnail') + subtitles = {} + if 'mpxRefId' in video_info: + subtitles['en'] = [{ + 'ext': 'ttml', + 'url': 'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId'], + }] + formats = [] for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: - uri = item.get('media' + format_id + 'URI') - if not uri: + pid = item.get('media' + format_id) + if not pid: continue - uri = remove_start(uri, '{manifest:none}') - fmt = { - 'url': uri, - 'format_id': format_id, - } - if uri.startswith('rtmp'): - play_path = re.sub( - r'{slistFilePath}', '', - uri.split('')[-1].split('{break}')[-1]) - play_path = re.sub( - r'{manifest:.+}.*$', '', play_path) - fmt.update({ - 'app': 'ondemand?auth=cbs', - 'play_path': 'mp4:' + play_path, - 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf', - 'page_url': 'http://www.cbsnews.com', - 'ext': 'flv', - }) - elif uri.endswith('.m3u8'): - fmt['ext'] = 'mp4' - formats.append(fmt) + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) return { 'id': video_id, @@ -91,4 +86,5 @@ class CBSNewsIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8da70ae14..b3d57dfce 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -825,6 +825,12 @@ class InfoExtractor(object): if not formats: raise ExtractorError('No video formats found') + for f in formats: + # Automatically determine tbr when missing based on abr and vbr (improves + # formats sorting in some cases) + if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: + f['tbr'] = f['abr'] + f['vbr'] + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext @@ -1014,6 +1020,18 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() + # A Media Playlist Tag MUST NOT appear in a Master Playlist + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: + return [{ + 'url': m3u8_url, + 'format_id': m3u8_id, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + }] last_info = None last_media = None kv_rex = re.compile( @@ -1058,9 +1076,9 @@ class InfoExtractor(object): # TODO: looks like video codec is not always necessarily goes first va_codecs = codecs.split(',') if va_codecs[0]: - f['vcodec'] = va_codecs[0].partition('.')[0] + f['vcodec'] = va_codecs[0] if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1].partition('.')[0] + f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') @@ -1164,6 +1182,7 @@ class InfoExtractor(object): formats = [] rtmp_count = 0 http_count = 0 + m3u8_count = 0 videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: @@ -1203,8 +1222,17 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if len(m3u8_formats) == 1: + m3u8_count += 1 + m3u8_formats[0].update({ + 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + formats.extend(m3u8_formats) continue if src_ext == 'f4m': diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b3ee67018..b8b9d058d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -68,11 +68,16 @@ class CSpanIE(InfoExtractor): video_type, video_id = matches.groups() video_type = 'clip' if video_type == 'id' else 'program' else: - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - title = self._og_search_title(webpage) - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) + m = re.search(r'data-(?Pclip|prog)id=["\'](?P\d+)', webpage) + if m: + video_id = m.group('id') + video_type = 'program' if m.group('type') == 'prog' else 'clip' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) if video_type is None or video_id is None: raise ExtractorError('unable to find video id and type') @@ -107,6 +112,13 @@ class CSpanIE(InfoExtractor): 'height': int_or_none(get_text_attr(quality, 'height')), 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), }) + if not formats: + path = unescapeHTML(get_text_attr(f, 'path')) + if not path: + continue + formats = self._extract_m3u8_formats( + path, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index f08f57157..9bc345f60 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -2,8 +2,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urllib_parse_unquote, +) from ..utils import ( int_or_none, str_to_int, @@ -12,7 +17,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/v/(?P[^?#&]+)' + _VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P[^?#&]+)' IE_NAME = 'daum.net' _TESTS = [{ @@ -23,25 +28,57 @@ class DaumIE(InfoExtractor): 'title': '마크 헌트 vs 안토니오 실바', 'description': 'Mark Hunt vs Antonio Silva', 'upload_date': '20131217', + 'thumbnail': 're:^https?://.*\.(?:jpg|png)', 'duration': 2117, 'view_count': int, 'comment_count': int, }, + }, { + 'url': 'http://m.tvpot.daum.net/v/65139429', + 'info_dict': { + 'id': '65139429', + 'ext': 'mp4', + 'title': 'md5:a100d65d09cec246d8aa9bde7de45aed', + 'description': 'md5:79794514261164ff27e36a21ad229fc5', + 'upload_date': '20150604', + 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'duration': 154, + 'view_count': int, + 'comment_count': int, + }, }, { 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', 'only_matching': True, + }, { + 'url': 'http://videofarm.daum.net/controller/player/VodPlayer.swf?vid=vwIpVpCQsT8%24&ref=', + 'info_dict': { + 'id': 'vwIpVpCQsT8$', + 'ext': 'flv', + 'title': '01-Korean War ( Trouble on the horizon )', + 'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름', + 'upload_date': '20080223', + 'thumbnail': 're:^https?://.*\.(?:jpg|png)', + 'duration': 249, + 'view_count': int, + 'comment_count': int, + }, }] def _real_extract(self, url): - video_id = self._match_id(url) + video_id = compat_urllib_parse_unquote(self._match_id(url)) query = compat_urllib_parse.urlencode({'vid': video_id}) - info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, - 'Downloading video info') movie_data = self._download_json( 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, video_id, 'Downloading video formats info') + # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid + if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): + return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) + + info = self._download_xml( + 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, + 'Downloading video info') + formats = [] for format_el in movie_data['output_list']['output_list']: profile = format_el['profile'] @@ -76,7 +113,7 @@ class DaumIE(InfoExtractor): class DaumClipIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.do|mypot/View.do)\?.*?clipid=(?P\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P\d+)' IE_NAME = 'daum.net:clip' _TESTS = [{ @@ -87,9 +124,13 @@ class DaumClipIE(InfoExtractor): 'title': 'DOTA 2GETHER 시즌2 6회 - 2부', 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', 'upload_date': '20130831', + 'thumbnail': 're:^https?://.*\.(?:jpg|png)', 'duration': 3868, 'view_count': int, }, + }, { + 'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index baa24c6d1..2d74ff855 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -91,7 +91,7 @@ class DRTVIE(InfoExtractor): subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { - 'Danish': 'dk', + 'Danish': 'da', } for subs in subtitles_list: lang = subs['Language'] diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 3762d8748..db4b263bc 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -53,8 +53,8 @@ class ESPNIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_id = self._search_regex( - r'class="video-play-button"[^>]+data-id="(\d+)', - webpage, 'video id') + r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P\d+)', + webpage, 'video id', group='id') cms = 'espn' if 'data-source="intl"' in webpage: diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index ec699ba54..899b0896b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -23,15 +23,23 @@ from ..utils import ( class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:\w+\.)?facebook\.com/ - (?:[^#]*?\#!/)? - (?: - (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?) - (?:v|video_id)=| - [^/]+/videos/(?:[^/]+/)? - ) - (?P[0-9]+) - (?:.*)''' + (?: + https?:// + (?:\w+\.)?facebook\.com/ + (?:[^#]*?\#!/)? + (?: + (?: + video/video\.php| + photo\.php| + video\.php| + video/embed + )\?(?:.*?)(?:v|video_id)=| + [^/]+/videos/(?:[^/]+/)? + )| + facebook: + ) + (?P[0-9]+) + ''' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -66,6 +74,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'only_matching': True, + }, { + 'url': 'facebook:544765982287235', + 'only_matching': True, }] def _login(self): @@ -139,10 +150,32 @@ class FacebookIE(InfoExtractor): url = 'https://www.facebook.com/video/video.php?v=%s' % video_id webpage = self._download_webpage(url, video_id) + video_data = None + BEFORE = '{swf.addParam(param[0], param[1]);});\n' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) - if not m: + if m: + data = dict(json.loads(m.group(1))) + params_raw = compat_urllib_parse_unquote(data['params']) + video_data = json.loads(params_raw)['video_data'] + + def video_data_list2dict(video_data): + ret = {} + for item in video_data: + format_id = item['stream_type'] + ret.setdefault(format_id, []).append(item) + return ret + + if not video_data: + server_js_data = self._parse_json(self._search_regex( + r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) + for item in server_js_data['instances']: + if item[1][0] == 'VideoConfig': + video_data = video_data_list2dict(item[2][0]['videoData']) + break + + if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
(.*?)
', webpage) if m_msg is not None: raise ExtractorError( @@ -150,12 +183,9 @@ class FacebookIE(InfoExtractor): expected=True) else: raise ExtractorError('Cannot parse data') - data = dict(json.loads(m.group(1))) - params_raw = compat_urllib_parse_unquote(data['params']) - params = json.loads(params_raw) formats = [] - for format_id, f in params['video_data'].items(): + for format_id, f in video_data.items(): if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): @@ -188,3 +218,33 @@ class FacebookIE(InfoExtractor): 'formats': formats, 'uploader': uploader, } + + +class FacebookPostIE(InfoExtractor): + IE_NAME = 'facebook:post' + _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P\d+)' + _TEST = { + 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', + 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', + 'info_dict': { + 'id': '544765982287235', + 'ext': 'mp4', + 'title': '"What are you doing running in the snow?"', + 'uploader': 'FailArmy', + } + } + + def _real_extract(self, url): + post_id = self._match_id(url) + + webpage = self._download_webpage(url, post_id) + + entries = [ + self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) + for video_id in self._parse_json( + self._search_regex( + r'(["\'])video_ids\1\s*:\s*(?P\[.+?\])', + webpage, 'video ids', group='ids'), + post_id)] + + return self.playlist_result(entries, post_id) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 027f55eb2..f6b9046f9 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -6,24 +6,29 @@ from ..utils import ( xpath_text, xpath_with_ns, ) +from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.tv/(?:videos|nieuws)/(?P[^/]+)' + _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P[^/]+)' _TESTS = [{ - 'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/', - # MD5 is flaky, seems to change regularly - # 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3', + # YouTube embed video + 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', + 'md5': '5208d3a17adeaef829a7861887cb9029', 'info_dict': { - 'id': 'phoenix-wright-ace-attorney-dual-destinies-review', + 'id': 'HkSQKetlGOU', 'ext': 'mp4', - 'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review', - 'description': 'md5:36fd701e57e8c15ac8682a2374c99731', + 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', + 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', + 'uploader': 'Gamekings Vault', + 'upload_date': '20151123', }, + 'add_ie': ['Youtube'], }, { # vimeo video - 'url': 'http://www.gamekings.tv/videos/the-legend-of-zelda-majoras-mask/', + 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', 'md5': '12bf04dfd238e70058046937657ea68d', 'info_dict': { 'id': 'the-legend-of-zelda-majoras-mask', @@ -33,7 +38,7 @@ class GamekingsIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, { - 'url': 'http://www.gamekings.tv/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', + 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', 'only_matching': True, }] @@ -43,7 +48,11 @@ class GamekingsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) playlist_id = self._search_regex( - r'gogoVideo\(\s*\d+\s*,\s*"([^"]+)', webpage, 'playlist id') + r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') + + # Check if a YouTube embed is used + if YoutubeIE.suitable(playlist_id): + return self.url_result(playlist_id, ie='Youtube') playlist = self._download_xml( 'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26d3698c8..b18e734c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1819,6 +1819,17 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Limelight embeds + mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P[a-z0-9]{32})', webpage) + if mobj: + lm = { + 'Media': 'media', + 'Channel': 'channel', + 'ChannelList': 'channel_list', + } + return self.url_result('limelight:%s:%s' % ( + lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) + # Look for AdobeTVVideo embeds mobj = re.search( r']+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index 421f55bbe..ff797438d 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -159,6 +159,9 @@ class HitboxLiveIE(HitboxIE): cdns = player_config.get('cdns') servers = [] for cdn in cdns: + # Subscribe URLs are not playable + if cdn.get('rtmpSubscribe') is True: + continue base_url = cdn.get('netConnectionUrl') host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1) if base_url not in servers: diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index e5e16ca3b..ed3e07118 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -21,6 +21,18 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', } + }, { + # missing description + 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + 'info_dict': { + 'id': 'BA-pQFBG8HZ', + 'ext': 'mp4', + 'uploader_id': 'britneyspears', + 'title': 'Video by britneyspears', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -32,8 +44,8 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', webpage, 'uploader id', fatal=False) - desc = self._search_regex(r'"caption":"(.*?)"', webpage, 'description', - fatal=False) + desc = self._search_regex( + r'"caption":"(.+?)"', webpage, 'description', default=None) return { 'id': video_id, diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 66a70a181..691cb66d6 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -214,8 +214,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-12-18 for Zombie::bite - enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1] + # last update at 2016-01-22 for Zombie::bite + enc_key = '6ab6d0280511493ba85594779759d4ed' return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py index 4597d1b96..6c3498c67 100644 --- a/youtube_dl/extractor/kanalplay.py +++ b/youtube_dl/extractor/kanalplay.py @@ -49,7 +49,7 @@ class KanalPlayIE(InfoExtractor): subs = self._download_json( 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), video_id, 'Downloading subtitles JSON', fatal=False) - return {'se': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} + return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 08bdae8a2..9665ece89 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -5,11 +5,13 @@ import datetime import re import time import base64 +import hashlib from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_ord, + compat_str, ) from ..utils import ( determine_ext, @@ -258,6 +260,7 @@ class LetvCloudIE(InfoExtractor): }, }, { 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', + 'md5': 'e03d9cc8d9c13191e1caf277e42dbd31', 'info_dict': { 'id': 'p7jnfw5hw9_ec93197892', 'ext': 'mp4', @@ -265,6 +268,7 @@ class LetvCloudIE(InfoExtractor): }, }, { 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', + 'md5': 'cb988699a776b22d4a41b9d43acfb3ac', 'info_dict': { 'id': 'p7jnfw5hw9_187060b6fd', 'ext': 'mp4', @@ -272,21 +276,37 @@ class LetvCloudIE(InfoExtractor): }, }] - def _real_extract(self, url): - uu_mobj = re.search('uu=([\w]+)', url) - vu_mobj = re.search('vu=([\w]+)', url) + @staticmethod + def sign_data(obj): + if obj['cf'] == 'flash': + salt = '2f9d6924b33a165a6d8b5d3d42f4f987' + items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] + elif obj['cf'] == 'html5': + salt = 'fbeh5player12c43eccf2bec3300344' + items = ['cf', 'ran', 'uu', 'bver', 'vu'] + input_data = ''.join([item + obj[item] for item in items]) + salt + obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() - if not uu_mobj or not vu_mobj: - raise ExtractorError('Invalid URL: %s' % url, expected=True) + def _get_formats(self, cf, uu, vu, media_id): + def get_play_json(cf, timestamp): + data = { + 'cf': cf, + 'ver': '2.2', + 'bver': 'firefox44.0', + 'format': 'json', + 'uu': uu, + 'vu': vu, + 'ran': compat_str(timestamp), + } + self.sign_data(data) + return self._download_json( + 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), + media_id, 'Downloading playJson data for type %s' % cf) - uu = uu_mobj.group(1) - vu = vu_mobj.group(1) - media_id = uu + '_' + vu - - play_json_req = sanitized_Request( - 'http://api.letvcloud.com/gpc.php?cf=html5&sign=signxxxxx&ver=2.2&format=json&' + - 'uu=' + uu + '&vu=' + vu) - play_json = self._download_json(play_json_req, media_id, 'Downloading playJson data') + play_json = get_play_json(cf, time.time()) + # The server time may be different from local time + if play_json.get('code') == 10071: + play_json = get_play_json(cf, play_json['timestamp']) if not play_json.get('data'): if play_json.get('message'): @@ -312,6 +332,21 @@ class LetvCloudIE(InfoExtractor): 'width': int_or_none(play_url.get('vwidth')), 'height': int_or_none(play_url.get('vheight')), }) + + return formats + + def _real_extract(self, url): + uu_mobj = re.search('uu=([\w]+)', url) + vu_mobj = re.search('vu=([\w]+)', url) + + if not uu_mobj or not vu_mobj: + raise ExtractorError('Invalid URL: %s' % url, expected=True) + + uu = uu_mobj.group(1) + vu = vu_mobj.group(1) + media_id = uu + '_' + vu + + formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index fb03dd527..1a0625ac3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -40,7 +40,8 @@ class LimelightBaseIE(InfoExtractor): if not stream_url: continue if '.f4m' in stream_url: - formats.extend(self._extract_f4m_formats(stream_url, video_id)) + formats.extend(self._extract_f4m_formats( + stream_url, video_id, fatal=False)) else: fmt = { 'url': stream_url, @@ -72,8 +73,8 @@ class LimelightBaseIE(InfoExtractor): format_id = mobile_url.get('targetMediaPlatform') if determine_ext(media_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=-1, m3u8_id=format_id)) + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) else: formats.append({ 'url': media_url, diff --git a/youtube_dl/extractor/lovehomeporn.py b/youtube_dl/extractor/lovehomeporn.py new file mode 100644 index 000000000..8f65a3c03 --- /dev/null +++ b/youtube_dl/extractor/lovehomeporn.py @@ -0,0 +1,37 @@ +from __future__ import unicode_literals + +import re + +from .nuevo import NuevoBaseIE + + +class LoveHomePornIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P\d+)(?:/(?P[^/?#&]+))?' + _TEST = { + 'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu', + 'info_dict': { + 'id': '48483', + 'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick', + 'ext': 'mp4', + 'title': 'Stunning busty brunette girlfriend sucking and riding a big dick', + 'age_limit': 18, + 'duration': 238.47, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + info = self._extract_nuevo( + 'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py new file mode 100644 index 000000000..28e0dfe63 --- /dev/null +++ b/youtube_dl/extractor/matchtv.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + sanitized_Request, + xpath_text, +) + + +class MatchTVIE(InfoExtractor): + _VALID_URL = r'https?://matchtv\.ru/?#live-player' + _TEST = { + 'url': 'http://matchtv.ru/#live-player', + 'info_dict': { + 'id': 'matchtv-live', + 'ext': 'flv', + 'title': 're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = 'matchtv-live' + request = sanitized_Request( + 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({ + 'ts': '', + 'quality': 'SD', + 'contentId': '561d2c0df7159b37178b4567', + 'sign': '', + 'includeHighlights': '0', + 'userId': '', + 'sessionId': random.randint(1, 1000000000), + 'contentType': 'channel', + 'timeShift': '0', + 'platform': 'portal', + }), + headers={ + 'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf', + }) + video_url = self._download_json(request, video_id)['data']['videoUrl'] + f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') + formats = self._extract_f4m_formats(f4m_url, video_id) + return { + 'id': video_id, + 'title': self._live_title('Матч ТВ - Прямой эфир'), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 9d26030d3..a071378b6 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -18,13 +18,17 @@ class NBAIE(InfoExtractor): 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { 'id': '0021200253-okc-bkn-recap', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, 'timestamp': 1354638466, 'upload_date': '20121204', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, @@ -68,7 +72,7 @@ class NBAIE(InfoExtractor): if video_url.startswith('/'): continue if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif video_url.endswith('.f4m'): formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) else: diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1dd54c2f1..18d01f423 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -19,32 +19,39 @@ class NBCIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.nbc.com/the-tonight-show/segments/112966', - # md5 checksum is not stable 'info_dict': { - 'id': 'c9xnCo0YPOPH', - 'ext': 'flv', + 'id': '112966', + 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', 'info_dict': { - 'id': 'XwU9KZkp98TH', + 'id': '176', 'ext': 'flv', 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', }, - 'skip': 'Only works from US', + 'skip': '404 Not Found', }, { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { - 'id': '8iUuyzWDdYUZ', - 'ext': 'flv', + 'id': '2832821', + 'ext': 'mp4', 'title': 'Star Wars Teaser', 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'skip': 'Only works from US', }, { @@ -66,7 +73,11 @@ class NBCIE(InfoExtractor): webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) if theplatform_url.startswith('//'): theplatform_url = 'http:' + theplatform_url - return self.url_result(smuggle_url(theplatform_url, {'source_url': url})) + return { + '_type': 'url_transparent', + 'url': smuggle_url(theplatform_url, {'source_url': url}), + 'id': video_id, + } class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 894c51399..0cded6b5c 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -193,7 +193,7 @@ class NDREmbedBaseIE(InfoExtractor): src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) else: quality = f.get('quality') ff = { diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eb12fb810..87f5675c7 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -189,7 +189,7 @@ class NPOIE(NPOBaseIE): if not video_url: continue if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) else: formats.append({ 'url': video_url, @@ -406,6 +406,38 @@ class NPORadioFragmentIE(InfoExtractor): } +class SchoolTVIE(InfoExtractor): + IE_NAME = 'schooltv' + _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P[^/?#&]+)' + + _TEST = { + 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', + 'info_dict': { + 'id': 'WO_NTR_429477', + 'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', + 'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', + 'ext': 'mp4', + 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-mid=(["\'])(?P.+?)\1', webpage, 'video_id', group='id') + return { + '_type': 'url_transparent', + 'ie_key': 'NPO', + 'url': 'npo:%s' % video_id, + 'display_id': display_id + } + + class VPROIE(NPOIE): IE_NAME = 'vpro' _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P[^/]+)\.html' diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ff13050d..a126f5054 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -133,26 +133,32 @@ class NRKTVIE(InfoExtractor): _TESTS = [ { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': 'adf2c5454fa2bf032f47a9f8fb351342', 'info_dict': { 'id': 'MUHH48000314', - 'ext': 'flv', + 'ext': 'mp4', 'title': '20 spørsmål', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'upload_date': '20140523', 'duration': 1741.52, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'md5': '383650ece2b25ecec996ad7b5bb2a384', 'info_dict': { 'id': 'mdfp15000514', - 'ext': 'flv', - 'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', 'upload_date': '20140524', - 'duration': 4605.0, + 'duration': 4605.08, + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py new file mode 100644 index 000000000..ef093dec2 --- /dev/null +++ b/youtube_dl/extractor/nuevo.py @@ -0,0 +1,38 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + float_or_none, + xpath_text +) + + +class NuevoBaseIE(InfoExtractor): + def _extract_nuevo(self, config_url, video_id): + config = self._download_xml( + config_url, video_id, transform_source=lambda s: s.strip()) + + title = xpath_text(config, './title', 'title', fatal=True).strip() + video_id = xpath_text(config, './mediaid', default=video_id) + thumbnail = xpath_text(config, ['./image', './thumb']) + duration = float_or_none(xpath_text(config, './duration')) + + formats = [] + for element_name, format_id in (('file', 'sd'), ('filehd', 'hd')): + video_url = xpath_text(config, element_name) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + self._check_formats(formats, video_id) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats + } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 184c7a323..f9e064a60 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -69,6 +69,12 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/videoembed/20648036891', 'only_matching': True, + }, { + 'url': 'http://m.ok.ru/video/20079905452', + 'only_matching': True, + }, { + 'url': 'http://mobile.ok.ru/video/20079905452', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/ruleporn.py b/youtube_dl/extractor/ruleporn.py new file mode 100644 index 000000000..ebf9808d5 --- /dev/null +++ b/youtube_dl/extractor/ruleporn.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class RulePornIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?ruleporn\.com/(?:[^/?#&]+/)*(?P[^/?#&]+)' + _TEST = { + 'url': 'http://ruleporn.com/brunette-nympho-chick-takes-her-boyfriend-in-every-angle/', + 'md5': '86861ebc624a1097c7c10eaf06d7d505', + 'info_dict': { + 'id': '48212', + 'display_id': 'brunette-nympho-chick-takes-her-boyfriend-in-every-angle', + 'ext': 'mp4', + 'title': 'Brunette Nympho Chick Takes Her Boyfriend In Every Angle', + 'description': 'md5:6d28be231b981fff1981deaaa03a04d5', + 'age_limit': 18, + 'duration': 635.1, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'lovehomeporn\.com/embed/(\d+)', webpage, 'video id') + + title = self._search_regex( + r']+title=(["\'])(?P.+?)\1', + webpage, 'title', group='url') + description = self._html_search_meta('description', webpage) + + info = self._extract_nuevo( + 'http://lovehomeporn.com/media/nuevo/econfig.php?key=%s&rp=true' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'title': title, + 'description': description, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 05f93904c..e5d62a139 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -71,7 +71,7 @@ class ScreenwaveMediaIE(InfoExtractor): formats = [] for source in sources: if source['type'] == 'hls': - formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) else: file_ = source.get('file') if not file_: @@ -107,7 +107,11 @@ class TeamFourIE(InfoExtractor): 'upload_date': '20130401', 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar', 'title': 'A Moment With TFS Episode 4', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 474ebb49b..990ea0fa8 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -53,17 +53,25 @@ class SenateISVPIE(InfoExtractor): 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { 'id': 'judiciary031715', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Integrated Senate Video Player', 'thumbnail': 're:^https?://.*\.(?:jpg|png)$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', 'info_dict': { 'id': 'commerce011514', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Integrated Senate Video Player' - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', # checksum differs each time diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 7f060b15b..3cfa671ed 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -34,11 +34,11 @@ class SpankBangIE(InfoExtractor): 'ext': 'mp4', 'format_id': '%sp' % height, 'height': int(height), - } for height in re.findall(r']+q_(\d+)p', webpage)] + } for height in re.findall(r'<(?:span|li)[^>]+q_(\d+)p', webpage)] self._sort_formats(formats) title = self._html_search_regex( - r'(?s)

(.+?)

', webpage, 'title') + r'(?s)]*>(.+?)', webpage, 'title') description = self._search_regex( r'class="desc"[^>]*>([^<]+)', webpage, 'description', default=None) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index fc20f664b..399c3b8ee 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -37,6 +37,14 @@ class SVTBaseIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + subtitle_references = video_info.get('subtitleReferences') + if isinstance(subtitle_references, list): + for sr in subtitle_references: + subtitle_url = sr.get('url') + if subtitle_url: + subtitles.setdefault('sv', []).append({'url': subtitle_url}) + duration = video_info.get('materialLength') age_limit = 18 if video_info.get('inappropriateForChildren') else 0 @@ -44,6 +52,7 @@ class SVTBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': age_limit, @@ -83,30 +92,23 @@ class SVTIE(SVTBaseIE): class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'https?://(?:www\.)?(?Psvtplay|oppetarkiv)\.se/video/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final', - 'md5': 'ade3def0643fa1c40587a422f98edfd9', + _TEST = { + 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', + 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { - 'id': '2609989', - 'ext': 'flv', - 'title': 'SM veckan vinter, Örebro - Rally, final', - 'duration': 4500, + 'id': '5996901', + 'ext': 'mp4', + 'title': 'Flygplan till Haile Selassie', + 'duration': 3527, 'thumbnail': 're:^https?://.*[\.-]jpg$', 'age_limit': 0, + 'subtitles': { + 'sv': [{ + 'ext': 'wsrt', + }] + }, }, - }, { - 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', - 'md5': 'c3101a17ce9634f4c1f9800f0746c187', - 'info_dict': { - 'id': '1058509', - 'ext': 'flv', - 'title': 'Farlig kryssning', - 'duration': 2566, - 'thumbnail': 're:^https?://.*[\.-]jpg$', - 'age_limit': 0, - }, - 'skip': 'Only works from Sweden', - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py new file mode 100644 index 000000000..d239949a6 --- /dev/null +++ b/youtube_dl/extractor/trollvids.py @@ -0,0 +1,36 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .nuevo import NuevoBaseIE + + +class TrollvidsIE(NuevoBaseIE): + _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P\d+)/(?P[^/?#&]+)' + IE_NAME = 'trollvids' + _TEST = { + 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', + 'md5': '1d53866b2c514b23ed69e4352fdc9839', + 'info_dict': { + 'id': '2349002', + 'ext': 'mp4', + 'title': '【MMD R-18】ガールフレンド carry_me_off', + 'age_limit': 18, + 'duration': 216.78, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + info = self._extract_nuevo( + 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, + video_id) + info.update({ + 'display_id': display_id, + 'age_limit': 18 + }) + return info diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index e7b79243a..d55e0c563 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import xpath_text +from .nuevo import NuevoBaseIE -class TruTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P[0-9]+)' +class TruTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P\d+)' _TESTS = [{ 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', @@ -22,19 +21,6 @@ class TruTubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - config = self._download_xml( + return self._extract_nuevo( 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id, transform_source=lambda s: s.strip()) - - # filehd is always 404 - video_url = xpath_text(config, './file', 'video URL', fatal=True) - title = xpath_text(config, './title', 'title').strip() - thumbnail = xpath_text(config, './image', ' thumbnail') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - } + video_id) diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index fa338b936..1457e524e 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -17,18 +17,21 @@ class TV2IE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', - 'md5': '9cb9e3410b18b515d71892f27856e9b1', 'info_dict': { 'id': '916509', - 'ext': 'flv', - 'title': 'Se Gryttens hyllest av Steven Gerrard', + 'ext': 'mp4', + 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', 'timestamp': 1431715610, 'upload_date': '20150515', 'duration': 156.967, 'view_count': int, 'categories': list, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 02dfd36f4..35fcff1b2 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -3,22 +3,20 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_urlparse, -) +from ..compat import compat_etree_fromstring from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + parse_iso8601, ) class VevoIE(InfoExtractor): - """ + ''' Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE and MySpaceIE) - """ + ''' _VALID_URL = r'''(?x) (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| @@ -28,19 +26,15 @@ class VevoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - "md5": "95ee28ee45e70130e3ab02b0f579ae23", + 'md5': '95ee28ee45e70130e3ab02b0f579ae23', 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', - "upload_date": "20130624", - "uploader": "Hurts", - "title": "Somebody to Die For", - "duration": 230.12, - "width": 1920, - "height": 1080, - # timestamp and upload_date are often incorrect; seem to change randomly - 'timestamp': int, - } + 'title': 'Somebody to Die For', + 'upload_date': '20130624', + 'uploader': 'Hurts', + 'timestamp': 1372057200, + }, }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', @@ -48,28 +42,23 @@ class VevoIE(InfoExtractor): 'info_dict': { 'id': 'USUV71302923', 'ext': 'mp4', + 'title': 'I Wish I Could Break Your Heart', 'upload_date': '20140219', 'uploader': 'Cassadee Pope', - 'title': 'I Wish I Could Break Your Heart', - 'duration': 226.101, - 'age_limit': 0, - 'timestamp': int, - } + 'timestamp': 1392796919, + }, }, { 'note': 'Age-limited video', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', 'info_dict': { 'id': 'USRV81300282', 'ext': 'mp4', - 'age_limit': 18, 'title': 'Tunnel Vision (Explicit)', + 'upload_date': '20130703', + 'age_limit': 18, 'uploader': 'Justin Timberlake', - 'upload_date': 're:2013070[34]', - 'timestamp': int, + 'timestamp': 1372888800, }, - 'params': { - 'skip_download': 'true', - } }, { 'note': 'No video_info', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', @@ -77,69 +66,46 @@ class VevoIE(InfoExtractor): 'info_dict': { 'id': 'USUV71503000', 'ext': 'mp4', - 'title': 'Till I Die - K Camp ft. T.I.', - 'duration': 193, + 'title': 'Till I Die', + 'upload_date': '20151207', + 'age_limit': 18, + 'uploader': 'K Camp', + 'timestamp': 1449468000, }, - 'expected_warnings': ['Unable to download SMIL file'], }] - _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' + _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' + _SOURCE_TYPES = { + 0: 'youtube', + 1: 'brightcove', + 2: 'http', + 3: 'hls_ios', + 4: 'hls', + 5: 'smil', # http + 7: 'f4m_cc', + 8: 'f4m_ak', + 9: 'f4m_l3', + 10: 'ism', + 13: 'smil', # rtmp + 18: 'dash', + } + _VERSIONS = { + 0: 'youtube', # only in AuthenticateVideo videoVersions + 1: 'level3', + 2: 'akamai', + 3: 'level3', + 4: 'amazon', + } - def _real_initialize(self): - req = sanitized_Request( - 'http://www.vevo.com/auth', data=b'') - webpage = self._download_webpage( - req, None, - note='Retrieving oauth token', - errnote='Unable to retrieve oauth token', - fatal=False) - if webpage is False: - self._oauth_token = None - else: - if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: - raise ExtractorError('%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) - - self._oauth_token = self._search_regex( - r'access_token":\s*"([^"]+)"', - webpage, 'access token', fatal=False) - - def _formats_from_json(self, video_info): - if not video_info: - return [] - - last_version = {'version': -1} - for version in video_info['videoVersions']: - # These are the HTTP downloads, other types are for different manifests - if version['sourceType'] == 2: - if version['version'] > last_version['version']: - last_version = version - if last_version['version'] == -1: - raise ExtractorError('Unable to extract last version of the video') - - renditions = compat_etree_fromstring(last_version['data']) + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): formats = [] - # Already sorted from worst to best quality - for rend in renditions.findall('rendition'): - attr = rend.attrib - format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr - formats.append({ - 'url': attr['url'], - 'format_id': attr['name'], - 'format_note': format_note, - 'height': int(attr['frameheight']), - 'width': int(attr['frameWidth']), - }) - return formats - - def _formats_from_smil(self, smil_doc): - formats = [] - els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') + els = smil.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] m = re.match(r'''(?xi) (?P[a-z0-9]+): (?P [/a-z0-9]+ # The directory and main part of the URL - _(?P[0-9]+)k + _(?P[0-9]+)k _(?P[0-9]+)x(?P[0-9]+) _(?P[a-z0-9]+) _(?P[0-9]+) @@ -153,9 +119,10 @@ class VevoIE(InfoExtractor): format_url = self._SMIL_BASE_URL + m.group('path') formats.append({ 'url': format_url, - 'format_id': 'SMIL_' + m.group('cbr'), + 'format_id': 'smil_' + m.group('tbr'), 'vcodec': m.group('vcodec'), 'acodec': m.group('acodec'), + 'tbr': int(m.group('tbr')), 'vbr': int(m.group('vbr')), 'abr': int(m.group('abr')), 'ext': m.group('ext'), @@ -164,48 +131,148 @@ class VevoIE(InfoExtractor): }) return formats - def _download_api_formats(self, video_id, video_url): - if not self._oauth_token: - self._downloader.report_warning( - 'No oauth token available, skipping API HLS download') - return [] + def _initialize_api(self, video_id): + req = sanitized_Request( + 'http://www.vevo.com/auth', data=b'') + webpage = self._download_webpage( + req, None, + note='Retrieving oauth token', + errnote='Unable to retrieve oauth token') - api_url = compat_urlparse.urljoin(video_url, '//apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( - video_id, self._oauth_token)) - api_data = self._download_json( - api_url, video_id, - note='Downloading HLS formats', - errnote='Failed to download HLS format list', fatal=False) - if api_data is None: - return [] + if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: + raise ExtractorError( + '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) - m3u8_url = api_data[0]['url'] - return self._extract_m3u8_formats( - m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4', - preference=0) + auth_info = self._parse_json(webpage, video_id) + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] + + def _call_api(self, path, video_id, note, errnote, fatal=True): + return self._download_json(self._api_url_template % path, video_id, note, errnote) def _real_extract(self, url): video_id = self._match_id(url) - webpage = None - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - response = self._download_json(json_url, video_id) - video_info = response['video'] or {} - - if not video_info and response.get('statusCode') != 909: - if 'statusMessage' in response: - raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True) - raise ExtractorError('Unable to extract videos') + response = self._download_json( + json_url, video_id, 'Downloading video info', 'Unable to download info') + video_info = response.get('video') or {} + video_versions = video_info.get('videoVersions') + uploader = None + timestamp = None + view_count = None + formats = [] if not video_info: - if url.startswith('vevo:'): - raise ExtractorError('Please specify full Vevo URL for downloading', expected=True) - webpage = self._download_webpage(url, video_id) + if response.get('statusCode') != 909: + ytid = response.get('errorInfo', {}).get('ytid') + if ytid: + self.report_warning( + 'Video is geoblocked, trying with the YouTube video %s' % ytid) + return self.url_result(ytid, 'Youtube', ytid) - title = video_info.get('title') or self._og_search_title(webpage) + if 'statusMessage' in response: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['statusMessage']), expected=True) + raise ExtractorError('Unable to extract videos') - formats = self._formats_from_json(video_info) + self._initialize_api(video_id) + video_info = self._call_api( + 'video/%s' % video_id, video_id, 'Downloading api video info', + 'Failed to download video info') + + video_versions = self._call_api( + 'video/%s/streams' % video_id, video_id, + 'Downloading video versions info', + 'Failed to download video versions info') + + timestamp = parse_iso8601(video_info.get('releaseDate')) + artists = video_info.get('artists') + if artists: + uploader = artists[0]['name'] + view_count = int_or_none(video_info.get('views', {}).get('total')) + + for video_version in video_versions: + version = self._VERSIONS.get(video_version['version']) + version_url = video_version.get('url') + if not version_url: + continue + + if '.mpd' in version_url or '.ism' in version_url: + continue + elif '.m3u8' in version_url: + formats.extend(self._extract_m3u8_formats( + version_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + else: + m = re.search(r'''(?xi) + _(?P[0-9]+)x(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + \.(?P[a-z0-9]+)''', version_url) + if not m: + continue + + formats.append({ + 'url': version_url, + 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + else: + timestamp = int_or_none(self._search_regex( + r'/Date\((\d+)\)/', + video_info['releaseDate'], 'release date', fatal=False), + scale=1000) + artists = video_info.get('mainArtists') + if artists: + uploader = artists[0]['artistName'] + + smil_parsed = False + for video_version in video_info['videoVersions']: + version = self._VERSIONS.get(video_version['version']) + if version == 'youtube': + continue + else: + source_type = self._SOURCE_TYPES.get(video_version['sourceType']) + renditions = compat_etree_fromstring(video_version['data']) + if source_type == 'http': + for rend in renditions.findall('rendition'): + attr = rend.attrib + formats.append({ + 'url': attr['url'], + 'format_id': 'http-%s-%s' % (version, attr['name']), + 'height': int_or_none(attr.get('frameheight')), + 'width': int_or_none(attr.get('frameWidth')), + 'tbr': int_or_none(attr.get('totalBitrate')), + 'vbr': int_or_none(attr.get('videoBitrate')), + 'abr': int_or_none(attr.get('audioBitrate')), + 'vcodec': attr.get('videoCodec'), + 'acodec': attr.get('audioCodec'), + }) + elif source_type == 'hls': + formats.extend(self._extract_m3u8_formats( + renditions.find('rendition').attrib['url'], video_id, + 'mp4', 'm3u8_native', m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + elif source_type == 'smil' and version == 'level3' and not smil_parsed: + formats.extend(self._extract_smil_formats( + renditions.find('rendition').attrib['url'], video_id, False)) + smil_parsed = True + self._sort_formats(formats) + + title = video_info['title'] is_explicit = video_info.get('isExplicit') if is_explicit is True: @@ -215,43 +282,16 @@ class VevoIE(InfoExtractor): else: age_limit = None - # Download via HLS API - formats.extend(self._download_api_formats(video_id, url)) - - # Download SMIL - smil_blocks = sorted(( - f for f in video_info.get('videoVersions', []) - if f['sourceType'] == 13), - key=lambda f: f['version']) - smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( - self._SMIL_BASE_URL, video_id, video_id.lower()) - if smil_blocks: - smil_url_m = self._search_regex( - r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL', - default=None) - if smil_url_m is not None: - smil_url = smil_url_m - if smil_url: - smil_doc = self._download_smil(smil_url, video_id, fatal=False) - if smil_doc: - formats.extend(self._formats_from_smil(smil_doc)) - - self._sort_formats(formats) - timestamp = int_or_none(self._search_regex( - r'/Date\((\d+)\)/', - video_info['launchDate'], 'launch date', fatal=False), - scale=1000) if video_info else None - - duration = video_info.get('duration') or int_or_none( - self._html_search_meta('video:duration', webpage)) + duration = video_info.get('duration') return { 'id': video_id, 'title': title, 'formats': formats, - 'thumbnail': video_info.get('imageUrl'), + 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), 'timestamp': timestamp, - 'uploader': video_info['mainArtists'][0]['artistName'] if video_info else None, + 'uploader': uploader, 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 86ba70ed9..14e945d49 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -86,10 +86,9 @@ class VGTVIE(XstreamIE): { # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', - 'md5': '458f4841239dab414343b50e5af8869c', 'info_dict': { 'id': '113063', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', @@ -98,6 +97,10 @@ class VGTVIE(XstreamIE): 'upload_date': '20150530', 'view_count': int, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 2ba9f31df..210a738a6 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import smuggle_url class VidziIE(InfoExtractor): @@ -20,19 +21,14 @@ class VidziIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_host = self._html_search_regex( - r'id=\'vplayer\'>(.*?)', webpage, 'title') + # Vidzi now uses jwplayer, which can be handled by GenericIE return { + '_type': 'url_transparent', 'id': video_id, 'title': title, - 'url': video_url, + 'url': smuggle_url(url, {'to_generic': True}), + 'ie_key': 'Generic', } diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 525e303d4..315984bf9 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -45,6 +45,10 @@ class ViideaIE(InfoExtractor): 'upload_date': '20130627', 'duration': 565, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 8bbac54e2..2466410fa 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -34,19 +34,20 @@ class XuiteIE(InfoExtractor): }, }, { # Video with only one format - 'url': 'http://vlog.xuite.net/play/TkRZNjhULTM0NDE2MjkuZmx2', - 'md5': 'c45737fc8ac5dc8ac2f92ecbcecf505e', + 'url': 'http://vlog.xuite.net/play/WUxxR2xCLTI1OTI1MDk5LmZsdg==', + 'md5': '21f7b39c009b5a4615b4463df6eb7a46', 'info_dict': { - 'id': '3441629', + 'id': '25925099', 'ext': 'mp4', - 'title': '孫燕姿 - 眼淚成詩', + 'title': 'BigBuckBunny_320x180', 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 217.399, - 'timestamp': 1299383640, - 'upload_date': '20110306', - 'uploader': 'Valen', - 'uploader_id': '10400126', - 'categories': ['影視娛樂'], + 'duration': 596.458, + 'timestamp': 1454242500, + 'upload_date': '20160131', + 'uploader': 'yan12125', + 'uploader_id': '12158353', + 'categories': ['個人短片'], + 'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', }, }, { # Video with two formats diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4a492f784..4c6142927 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -221,6 +221,8 @@ class YahooIE(InfoExtractor): r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"', r'"first_videoid"\s*:\s*"([^"]+)"', r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id), + r']data-uuid=["\']([^"\']+)', + r'yahoo://article/view\?.*\buuid=([^&"\']+)', ] video_id = self._search_regex( CONTENT_ID_REGEXES, webpage, 'content ID') diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index dd724085a..b29baafc4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -114,15 +114,13 @@ class YouPornIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = self._html_search_regex( - r'(?s)]+class=["\']video-description["\'][^>]*>(.+?)', - webpage, 'description', default=None) + description = self._og_search_description(webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') uploader = self._html_search_regex( - r'(?s)]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*(.+?)', + r'(?s)]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*(.+?)', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( r'(?s)]+class=["\']videoInfoTime["\'][^>]*>(.+?)', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d31161d21..bd87c75b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -32,6 +32,7 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, int_or_none, + mimetype2ext, orderedSet, parse_duration, remove_quotes, @@ -180,7 +181,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeEntryListBaseInfoExtractor(InfoExtractor): +class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page @@ -232,7 +233,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content): + for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -277,55 +278,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor): $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240}, - '6': {'ext': 'flv', 'width': 450, 'height': 270}, - '13': {'ext': '3gp'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720}, - '34': {'ext': 'flv', 'width': 640, 'height': 360}, - '35': {'ext': 'flv', 'width': 854, 'height': 480}, - '36': {'ext': '3gp', 'width': 320, 'height': 240}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072}, - '43': {'ext': 'webm', 'width': 640, 'height': 360}, - '44': {'ext': 'webm', 'width': 854, 'height': 480}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480}, + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '36': {'ext': '3gp', 'width': 320, 'height': 240, 'acodec': 'aac', 'abr': 32, 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # 3d videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20}, + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'h264', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, @@ -339,26 +340,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'preference': -40}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, - '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # Dash webm audio with opus inside '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, @@ -964,6 +965,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: args = player_config['args'] caption_url = args['ttsurl'] + if not caption_url: + self._downloader.report_warning(err_msg) + return {} timestamp = args['timestamp'] # We get the available subtitles list_params = compat_urllib_parse.urlencode({ @@ -1087,9 +1091,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): full_info.update(f) codecs = r.attrib.get('codecs') if codecs: - if full_info.get('acodec') == 'none' and 'vcodec' not in full_info: + if full_info.get('acodec') == 'none': full_info['vcodec'] = codecs - elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info: + elif full_info.get('vcodec') == 'none': full_info['acodec'] = codecs formats.append(full_info) else: @@ -1458,15 +1462,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' + dct = { + 'format_id': format_id, + 'url': url, + 'player_url': player_url, + } + if format_id in self._formats: + dct.update(self._formats[format_id]) + # Some itags are not included in DASH manifest thus corresponding formats will # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). # Trying to extract metadata from url_encoded_fmt_stream_map entry. mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, + + more_fields = { 'filesize': int_or_none(url_data.get('clen', [None])[0]), 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, @@ -1474,13 +1484,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fps': int_or_none(url_data.get('fps', [None])[0]), 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], } + for key, value in more_fields.items(): + if value: + dct[key] = value type_ = url_data.get('type', [None])[0] if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') if len(kind_ext) == 2: - kind, ext = kind_ext - dct['ext'] = ext + kind, _ = kind_ext + dct['ext'] = mimetype2ext(type_split[0]) if kind in ('audio', 'video'): codecs = None for mobj in re.finditer( @@ -1498,8 +1511,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'acodec': acodec, 'vcodec': vcodec, }) - if format_id in self._formats: - dct.update(self._formats[format_id]) formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] @@ -1591,7 +1602,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): +class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? @@ -1835,7 +1846,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' IE_NAME = 'youtube:user' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index ade58c375..2137dfb3f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -380,7 +380,7 @@ def parseOpts(overrideArguments=None): '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_comma_separated_values_options_callback, - help='Languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') + help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags') downloader = optparse.OptionGroup(parser, 'Download Options') downloader.add_option( @@ -415,6 +415,11 @@ def parseOpts(overrideArguments=None): '--hls-prefer-native', dest='hls_prefer_native', action='store_true', help='Use the native HLS downloader instead of ffmpeg (experimental)') + downloader.add_option( + '--hls-use-mpegts', + dest='hls_use_mpegts', action='store_true', + help='Use the mpegts container for HLS videos, allowing to play the ' + 'video while downloading (some players may not be able to play it)') downloader.add_option( '--external-downloader', dest='external_downloader', metavar='COMMAND', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index daca5d814..16a64802a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -479,6 +479,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert') return [], info self._downloader.to_screen('[ffmpeg] Converting subtitles') + sub_filenames = [] for lang, sub in subs.items(): ext = sub['ext'] if ext == new_ext: @@ -486,6 +487,8 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): '[ffmpeg] Subtitle file for %s is already in the requested' 'format' % new_ext) continue + old_file = subtitles_filename(filename, lang, ext) + sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext) if ext == 'dfxp' or ext == 'ttml': @@ -493,7 +496,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): 'You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') - dfxp_file = subtitles_filename(filename, lang, ext) + dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') with io.open(dfxp_file, 'rt', encoding='utf-8') as f: @@ -511,9 +514,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): if new_ext == 'srt': continue - self.run_ffmpeg( - subtitles_filename(filename, lang, ext), - new_file, ['-f', new_format]) + self.run_ffmpeg(old_file, new_file, ['-f', new_format]) with io.open(new_file, 'rt', encoding='utf-8') as f: subs[lang] = { @@ -521,4 +522,4 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): 'data': f.read(), } - return [], info + return sub_filenames, info diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 995b8ed96..e4a1aaa64 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -15,33 +15,17 @@ from .version import __version__ def rsa_verify(message, signature, key): - from struct import pack from hashlib import sha256 - assert isinstance(message, bytes) - block_size = 0 - n = key[0] - while n: - block_size += 1 - n >>= 8 - signature = pow(int(signature, 16), key[1], key[0]) - raw_bytes = [] - while signature: - raw_bytes.insert(0, pack("B", signature & 0xFF)) - signature >>= 8 - signature = (block_size - len(raw_bytes)) * b'\x00' + b''.join(raw_bytes) - if signature[0:2] != b'\x00\x01': + byte_size = (len(bin(key[0])) - 2 + 8 - 1) // 8 + signature = ('%x' % pow(int(signature, 16), key[1], key[0])).encode() + signature = (byte_size * 2 - len(signature)) * b'0' + signature + asn1 = b'3031300d060960864801650304020105000420' + asn1 += sha256(message).hexdigest().encode() + if byte_size < len(asn1) // 2 + 11: return False - signature = signature[2:] - if b'\x00' not in signature: - return False - signature = signature[signature.index(b'\x00') + 1:] - if not signature.startswith(b'\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20'): - return False - signature = signature[19:] - if signature != sha256(message).digest(): - return False - return True + expected = b'0001' + (byte_size - len(asn1) // 2 - 3) * b'ff' + b'00' + asn1 + return expected == signature def update_self(to_screen, verbose, opener): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c1c0e0bd..c63b61598 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1828,9 +1828,11 @@ def mimetype2ext(mt): _, _, res = mt.rpartition('/') return { - 'x-ms-wmv': 'wmv', - 'x-mp4-fragmented': 'mp4', + '3gpp': '3gp', 'ttml+xml': 'ttml', + 'x-flv': 'flv', + 'x-mp4-fragmented': 'mp4', + 'x-ms-wmv': 'wmv', }.get(res, res) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 02c438f3a..006b960b3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.01.15' +__version__ = '2016.01.31'