diff --git a/AUTHORS b/AUTHORS index 47f12a9ee..bdd2a15dc 100644 --- a/AUTHORS +++ b/AUTHORS @@ -111,3 +111,4 @@ Paul Hartmann Frans de Jonge Robin de Rooij Ryan Schmidt +Leslie P. Polzer diff --git a/Makefile b/Makefile index 573c82685..708732956 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz diff --git a/README.md b/README.md index a2c148311..699401b49 100644 --- a/README.md +++ b/README.md @@ -351,8 +351,8 @@ which means you can modify it, redistribute it or use it however you like. --all-subs downloads all the available subtitles of the video --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] - youtube only) + --sub-format FORMAT subtitle format, accepts formats + preference, for example: "ass/srt/best" --sub-lang LANGS languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' @@ -571,7 +571,7 @@ Support requests for services that **do** purchase the rights to distribute thei ### How can I detect whether a given URL is supported by youtube-dl? -For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/v/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. +For one, have a look at the [list of supported sites](docs/supportedsites.md). 
Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports a URL of a service in that list as unsupported. In that case, simply report a bug. It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 216282712..6a5bd9eda 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -45,12 +45,12 @@ for test in get_testcases(): RESULT = ('.' 
+ domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] - or test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or + test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] - and test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f6ba28e7a..9f70db80a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -72,6 +72,8 @@ - **CeskaTelevize** - **channel9**: Channel 9 - **Chilloutzone** + - **chirbit** + - **chirbit:profile** - **Cinchcast** - **Cinemassacre** - **clipfish** @@ -330,6 +332,7 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Pyvideo** - **QuickVid** + - **R7** - **radio.de** - **radiobremen** - **radiofrance** @@ -385,7 +388,8 @@ - **soundcloud:playlist** - **soundcloud:set** - **soundcloud:user** - - **Soundgasm** + - **soundgasm** + - **soundgasm:profile** - **southpark.cc.com** - **southpark.de** - **Space** @@ -451,6 +455,7 @@ - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV4**: tv4.se and tv4play.se - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** @@ -558,6 +563,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **Zapiks** - **ZDF** - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums diff --git a/test/parameters.json b/test/parameters.json index 
af77b89b4..cbff9bd16 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -28,7 +28,7 @@ "retries": 10, "simulate": false, "subtitleslang": null, - "subtitlesformat": "srt", + "subtitlesformat": "best", "test": true, "updatetime": true, "usenetrc": false, diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b1cd6a69f..055e42555 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -337,6 +337,65 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + def test_subtitles(self): + def s_formats(lang, autocaption=False): + return [{ + 'ext': ext, + 'url': 'http://localhost/video.%s.%s' % (lang, ext), + '_auto': autocaption, + } for ext in ['vtt', 'srt', 'ass']] + subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) + auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) + info_dict = { + 'id': 'test', + 'title': 'Test', + 'url': 'http://localhost/video.mp4', + 'subtitles': subtitles, + 'automatic_captions': auto_captions, + 'extractor': 'TEST', + } + + def get_info(params={}): + params.setdefault('simulate', True) + ydl = YDL(params) + ydl.report_warning = lambda *args, **kargs: None + return ydl.process_video_result(info_dict, download=False) + + result = get_info() + self.assertFalse(result.get('requested_subtitles')) + self.assertEqual(result['subtitles'], subtitles) + self.assertEqual(result['automatic_captions'], auto_captions) + + result = get_info({'writesubtitles': True}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + self.assertTrue(subs['en'].get('data') is None) + self.assertEqual(subs['en']['ext'], 'ass') + + result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) + subs = result['requested_subtitles'] + self.assertEqual(subs['en']['ext'], 'srt') + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 
'fr', 'it']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertFalse(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + + result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertTrue(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..3f2d8a2ba 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -18,6 +18,14 @@ from youtube_dl.extractor import ( VimeoIE, WallaIE, CeskaTelevizeIE, + LyndaIE, + NPOIE, + ComedyCentralIE, + NRKTVIE, + RaiIE, + VikiIE, + ThePlatformIE, + RTVEALaCartaIE, ) @@ -27,42 +35,38 @@ class BaseTestSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() - self.ie = self.IE(self.DL) + self.ie = self.IE() + self.DL.add_info_extractor(self.ie) def getInfoDict(self): - info_dict = self.ie.extract(self.url) + info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] + if not subtitles: + return subtitles + for sub_info in subtitles.values(): + if sub_info.get('data') is None: + uf = self.DL.urlopen(sub_info['url']) + sub_info['data'] = uf.read().decode('utf-8') + return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_no_writesubtitles(self): - 
self.DL.params['writesubtitles'] = False - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_youtube_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - - def test_youtube_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + for lang in ['it', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True @@ -76,12 +80,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - def test_youtube_list_subtitles(self): - self.DL.expect_warning('Video doesn\'t have automatic captions') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True @@ -103,55 +101,22 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) - - def test_youtube_multiple_langs(self): - self.url = 'QRS8MkLhQmM' - self.DL.params['writesubtitles'] = True - langs = ['it', 'fr', 'de'] - 
self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertFalse(subtitles) class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) - - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + for lang in ['es', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -159,61 +124,21 @@ class TestDailymotionSubtitles(BaseTestSubtitles): 
self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) - - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertFalse(subtitles) class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 28) - - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - 
for lang in langs: + self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') + self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') + for lang in ['es', 'fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) @@ -221,14 +146,7 @@ class TestBlipTVSubtitles(BaseTestSubtitles): url = 'http://blip.tv/a/a-6603250' IE = BlipTVIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() @@ -240,39 +158,13 @@ class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic 
Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) + self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') + self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -280,27 +172,13 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) - - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) + self.assertFalse(subtitles) class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True @@ -315,19 +193,13 @@ class TestWallaSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - 
self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True @@ -342,7 +214,110 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) + + +class TestLyndaSubtitles(BaseTestSubtitles): + url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' + IE = LyndaIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') + + +class TestNPOSubtitles(BaseTestSubtitles): + url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' + IE = NPOIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['nl'])) + self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') + + +class TestMTVSubtitles(BaseTestSubtitles): + url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + IE = ComedyCentralIE + + def getInfoDict(self): + return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + + +class TestNRKSubtitles(BaseTestSubtitles): + url = 
'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' + IE = NRKTVIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['no'])) + self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + + +class TestRaiSubtitles(BaseTestSubtitles): + url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' + IE = RaiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + + +class TestVikiSubtitles(BaseTestSubtitles): + url = 'http://www.viki.com/videos/1060846v-punch-episode-18' + IE = VikiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') + + +class TestThePlatformSubtitles(BaseTestSubtitles): + # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ + # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) + url = 'theplatform:JFUjUE1_ehvq' + IE = ThePlatformIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') + + +class TestRtveSubtitles(BaseTestSubtitles): + url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' + IE = RTVEALaCartaIE 
+ + def test_allsubtitles(self): + print('Skipping, only available from Spain') + return + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['es'])) + self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') if __name__ == '__main__': diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 9f18055e6..f1e899819 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -34,8 +34,8 @@ def _make_testfunc(testfile): def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') - if ((not os.path.exists(swf_file)) - or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + if ((not os.path.exists(swf_file)) or + os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ diff --git a/test/test_utils.py b/test/test_utils.py index c7373af1e..2f8996d7b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -85,6 +85,8 @@ class TestUtil(unittest.TestCase): self.assertEqual( sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') + self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') forbidden = '"\0\\/' for fc in forbidden: diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 88809783b..76fc394bc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A 
DateRange object, download only if the upload_date is in the range. @@ -308,8 +308,8 @@ class YoutubeDL(object): raise if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@ -1008,6 +1008,15 @@ class YoutubeDL(object): info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') + return + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], info_dict.get('subtitles'), + info_dict.get('automatic_captions')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -1136,6 +1145,55 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if 
self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1238,15 +1296,22 @@ class YoutubeDL(object): subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + subtitles = info_dict['requested_subtitles'] + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, 
socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1254,7 +1319,7 @@ class YoutubeDL(object): else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@ -1366,8 +1431,8 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in outtmpl - and self.params.get('max_downloads') != 1): + '%' not in outtmpl and + self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: @@ -1564,6 +1629,17 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles, name='subtitles'): + if not subtitles: + self.to_screen('%s has no %s' % (video_id, name)) + return + self.to_screen( + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) + def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index eac2a26ec..5ce201800 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -189,14 +189,14 @@ def _real_main(argv=None): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: opts.outtmpl = opts.outtmpl.decode(preferredencoding()) - outtmpl 
= ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' @@ -226,7 +226,6 @@ def _real_main(argv=None): if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 45e55b99c..3ae90021a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -311,14 +311,14 @@ class FileDownloader(object): """ nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) - and os.path.exists(encodeFilename(filename)) + self.params.get('nooverwrites', False) and + os.path.exists(encodeFilename(filename)) ) continuedl_and_exists = ( - self.params.get('continuedl', False) - and os.path.isfile(encodeFilename(filename)) - and not self.params.get('nopart', False) + self.params.get('continuedl', False) and + os.path.isfile(encodeFilename(filename)) and + not self.params.get('nopart', False) ) # Check file already present diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b40ebfa50..7b8fe8cf5 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -325,8 +325,8 @@ class F4mFD(FileDownloader): state['frag_index'] += 1 estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) - / (state['frag_index'] + 1) * total_frags) + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) time_now = time.time() state['total_bytes_estimate'] = estimated_size state['elapsed'] = time_now - start diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 202c0e67a..b35173291 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,6 +63,10 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE +from .chirbit import ( + 
ChirbitIE, + ChirbitProfileIE, +) from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE @@ -365,6 +369,7 @@ from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .quickvid import QuickVidIE +from .r7 import R7IE from .radiode import RadioDeIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE @@ -425,7 +430,10 @@ from .soundcloud import ( SoundcloudUserIE, SoundcloudPlaylistIE ) -from .soundgasm import SoundgasmIE +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) from .southpark import ( SouthParkIE, SouthparkDeIE, @@ -613,6 +621,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) +from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ( ZingMp3SongIE, diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 28e07f8b0..97d128560 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -28,7 +28,6 @@ class AdobeTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) player = self._parse_json( @@ -44,8 +43,10 @@ class AdobeTVIE(InfoExtractor): self._html_search_meta('datepublished', webpage, 'upload date')) duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') - or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration')) + self._html_search_meta('duration', webpage, 'duration') or + self._search_regex( + r'Runtime:\s*(\d{2}:\d{2}:\d{2})', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'
\s*Views?:\s*([\d,.]+)\s*
', diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 43e82847f..576f03b5b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,8 +11,8 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P[^/]+)/(?P[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' + _TESTS = [{ "url": "http://trailers.apple.com/trailers/wb/manofsteel/", 'info_dict': { 'id': 'manofsteel', @@ -63,7 +63,10 @@ class AppleTrailersIE(InfoExtractor): }, }, ] - } + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }] _JSON_RE = r'iTunes.playURL\((.*?)\);' diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index f016368fa..7669e0e3d 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import time import hmac -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, @@ -17,7 +17,7 @@ from ..utils import ( ) -class AtresPlayerIE(SubtitlesInfoExtractor): +class AtresPlayerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P.+?)_\d+\.html' _TESTS = [ { @@ -144,13 +144,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor): thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') subtitles = {} - subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle: - subtitles['es'] = subtitle - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') + if subtitle_url: + subtitles['es'] = [{ + 'ext': 
'srt', + 'url': subtitle_url, + }] return { 'id': video_id, @@ -159,5 +158,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f23e39545..abc34a576 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals import xml.etree.ElementTree -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ExtractorError from ..compat import compat_HTTPError -class BBCCoUkIE(SubtitlesInfoExtractor): +class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' @@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor): formats.extend(conn_formats) return formats - def _extract_captions(self, media, programme_id): + def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) srt = '' + + def _extract_text(p): + if p.text is not None: + stripped_text = p.text.strip() + if stripped_text: + return stripped_text + return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), - p.text.strip() if p.text is not None else '') - subtitles[lang] = srt + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), 
p.get('end'), _extract_text(p)) + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + { + 'data': srt, + 'ext': 'srt', + }, + ] return subtitles def _download_media_selector(self, programme_id): @@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor): elif kind == 'video': formats.extend(self._extract_video(media, programme_id)) elif kind == 'captions': - subtitles = self._extract_captions(media, programme_id) + subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor): else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(programme_id, subtitles) - return - self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 436cc5155..8c7ba4b91 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@ -18,7 +17,7 @@ from ..utils import ( ) -class BlipTVIE(SubtitlesInfoExtractor): +class BlipTVIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P\d+)|((?:play/|api\.swf#)(?P[\da-zA-Z+_]+)))' _TESTS = [ @@ -143,7 +142,7 @@ class BlipTVIE(SubtitlesInfoExtractor): categories = [category.text for category in item.findall('category')] formats = [] - subtitles = {} + subtitles_urls = {} media_group = item.find(media('group')) for media_content in media_group.findall(media('content')): @@ -161,7 +160,7 @@ class BlipTVIE(SubtitlesInfoExtractor): } lang = role.rpartition('-')[-1].strip().lower() langcode = LANGS.get(lang, lang) - subtitles[langcode] = url + subtitles_urls[langcode] = url elif media_type.startswith('video/'): 
formats.append({ 'url': real_url, @@ -175,11 +174,7 @@ class BlipTVIE(SubtitlesInfoExtractor): }) self._sort_formats(formats) - # subtitles - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, subtitles_urls) return { 'id': video_id, @@ -192,15 +187,22 @@ class BlipTVIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'categories': categories, 'formats': formats, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } - def _download_subtitle_url(self, sub_lang, url): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - return self._download_webpage(req, None, note=False) + def _get_subtitles(self, video_id, subtitles_urls): + subtitles = {} + for lang, url in subtitles_urls.items(): + # For some weird reason, blip.tv serves a video instead of subtitles + # when we request with a common UA + req = compat_urllib_request.Request(url) + req.add_header('User-Agent', 'youtube-dl') + subtitles[lang] = [{ + # The extension is 'srt' but it's actually an 'ass' file + 'ext': 'ass', + 'data': self._download_webpage(req, None, note=False), + }] + return subtitles class BlipTVUserIE(InfoExtractor): diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index c51a97ce4..4a88ccd13 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' _TEST = { 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', @@ -20,9 +20,9 @@ class 
BloombergIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + name = self._match_id(url) webpage = self._download_webpage(url, name) + f4m_url = self._search_regex( r'[^?#]+)' _TESTS = [ @@ -107,13 +107,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): subtitles = {} subs = item.get('subtitles') if subs: - subtitles['cs'] = subs[0]['url'] - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + subtitles = self.extract_subtitles(episode_id, subs) return { 'id': episode_id, @@ -125,11 +119,20 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + @staticmethod def _fix_subtitles(subtitles): """ Convert millisecond-based subtitles to SRT """ - if subtitles is None: - return subtitles # subtitles not requested def _msectotimecode(msec): """ Helper utility to convert milliseconds to timecode """ @@ -149,7 +152,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): else: yield line - fixed_subtitles = {} - for k, v in subtitles.items(): - fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) - return fixed_subtitles + return "\r\n".join(_fix_subtitle(subtitles)) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..b1eeaf101 --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) + + +class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' + _VALID_URL 
= r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'duration': 52, + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) + + return { + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + } + + +class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'info_dict': { + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) + + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] + + title = rss.find('./channel/title').text + + return self.playlist_result(entries, profile_id, title) diff --git a/youtube_dl/extractor/common.py 
b/youtube_dl/extractor/common.py index 08b8ad37c..87fce9cd8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -150,8 +150,14 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. "subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A url pointing to the subtitles file + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video @@ -391,6 +397,16 @@ class InfoExtractor(object): if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if 'The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

(.*?)

', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content @@ -798,8 +814,8 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' - + (media_el.attrib.get('href') or media_el.attrib.get('url'))) + manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + + (media_el.attrib.get('href') or media_el.attrib.get('url'))) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), @@ -823,7 +839,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] @@ -1001,6 +1017,24 @@ class InfoExtractor(object): any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + + def extract_automatic_captions(self, *args, **kwargs): + if (self._downloader.params.get('writeautomaticsub', False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py 
index 1680f532f..f1da7d09b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -25,10 +25,9 @@ from ..aes import ( aes_cbc_decrypt, inc, ) -from .common import InfoExtractor -class CrunchyrollIE(SubtitlesInfoExtractor): +class CrunchyrollIE(InfoExtractor): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -187,6 +186,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _get_subtitles(self, video_id, webpage): + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage( + 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, + video_id, note='Downloading subtitles for ' + sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) + iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) + data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') + lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) + if not lang_code: + continue + sub_root = xml.etree.ElementTree.fromstring(subtitle) + subtitles[lang_code] = [ + { + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, + { + 'ext': 'ass', + 
'data': self._convert_subtitles_to_ass(sub_root), + }, + ] + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') @@ -249,34 +280,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'format_id': video_format, }) - subtitles = {} - sub_format = self._downloader.params.get('subtitlesformat', 'srt') - for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): - sub_page = self._download_webpage( - 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, - video_id, note='Downloading subtitles for ' + sub_name) - id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) - iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) - data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) - if not id or not iv or not data: - continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') - lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - if sub_format == 'ass': - subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root) - else: - subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index b2dbf4a92..42b20a46d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -6,7 +6,6 @@ import json import itertools from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from 
..compat import ( compat_str, @@ -31,7 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): return request -class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P[^/?_]+)' @@ -143,9 +142,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, webpage) - return view_count = str_to_int(self._search_regex( r'video_views_count[^>]+>\s+([\d\.,]+)', @@ -169,7 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@ -179,7 +175,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return {} info = json.loads(sub_list) if (info['total'] > 0): - sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) return sub_lang_list self._downloader.report_warning('video doesn\'t have subtitles') return {} diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 2b90bf4fc..98e3aedfd 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -25,8 +25,9 @@ class DefenseGouvFrIE(InfoExtractor): r"flashvars.pvg_id=\"(\d+)\";", webpage, 'ID') - json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/' - + video_id) + json_url = ( + 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' % + 
video_id) info = self._download_json(json_url, title, 'Downloading JSON config') video_url = info['renditions'][0]['url'] diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index d5df18d7c..8257e35a4 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor -from .common import ExtractorError +from .common import InfoExtractor, ExtractorError from ..utils import parse_iso8601 -class DRTVIE(SubtitlesInfoExtractor): +class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' _TEST = { @@ -76,7 +75,7 @@ class DRTVIE(SubtitlesInfoExtractor): } for subs in subtitles_list: lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = subs['Uri'] + subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] if not formats and restricted_to_denmark: raise ExtractorError( @@ -84,10 +83,6 @@ class DRTVIE(SubtitlesInfoExtractor): self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - return { 'id': video_id, 'title': title, @@ -96,5 +91,5 @@ class DRTVIE(SubtitlesInfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 4de8d4bc5..e006921ec 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,10 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'(.*?) 
- EPORNER', webpage, 'title') - redirect_code = self._html_search_regex( - r'<script type="text/javascript" src="/config5/%s/([a-f\d]+)/">' % video_id, - webpage, 'redirect_code') - redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, redirect_code) + redirect_url = 'http://www.eporner.com/config5/%s' % video_id player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') @@ -69,5 +66,5 @@ class EpornerIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'formats': formats, - 'age_limit': self._rta_search(webpage), + 'age_limit': 18, } diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 4303feccd..51ffec7ee 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -22,6 +22,7 @@ class EscapistIE(InfoExtractor): 'uploader_id': 'the-escapist-presents', 'uploader': 'The Escapist Presents', 'title': "Breaking Down Baldur's Gate", + 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -30,19 +31,25 @@ class EscapistIE(InfoExtractor): webpage = self._download_webpage(url, video_id) uploader_id = self._html_search_regex( - r"<h1 class='headline'><a href='/videos/view/(.*?)'", + r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'", webpage, 'uploader ID', fatal=False) uploader = self._html_search_regex( - r"<h1 class='headline'>(.*?)</a>", + r"<h1\s+class='headline'>(.*?)</a>", webpage, 'uploader', fatal=False) description = self._html_search_meta('description', webpage) raw_title = self._html_search_meta('title', webpage, fatal=True) title = raw_title.partition(' : ')[2] - player_url = self._og_search_video_url(webpage, name='player URL') - config_url = compat_urllib_parse.unquote(self._search_regex( - r'config=(.*)$', player_url, 'config URL')) + config_url = compat_urllib_parse.unquote(self._html_search_regex( + r'''(?x) + (?: + <param\s+name="flashvars"\s+value="config=| + flashvars="config= + ) + ([^"&]+) + ''', + webpage, 'config URL')) 
formats = [] @@ -81,5 +88,4 @@ class EscapistIE(InfoExtractor): 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'description': description, - 'player_url': player_url, } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 1ad4e77a8..f0e575320 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -126,11 +126,17 @@ class FacebookIE(InfoExtractor): params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) video_data = params['video_data'][0] - video_url = video_data.get('hd_src') - if not video_url: - video_url = video_data['sd_src'] - if not video_url: - raise ExtractorError('Cannot find video URL') + + formats = [] + for quality in ['sd', 'hd']: + src = video_data.get('%s_src' % quality) + if src is not None: + formats.append({ + 'format_id': quality, + 'url': src, + }) + if not formats: + raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', @@ -146,7 +152,7 @@ class FacebookIE(InfoExtractor): return { 'id': video_id, 'title': video_title, - 'url': video_url, + 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), } diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index fed968f51..f7b467b0a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) +from ..utils import remove_end class GDCVaultIE(InfoExtractor): @@ -65,10 +66,12 @@ class GDCVaultIE(InfoExtractor): def _parse_flv(self, xml_description): video_formats = [] - akami_url = xml_description.find('./metadata/akamaiHost').text + akamai_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ - 
'url': 'rtmp://' + akami_url + '/' + slide_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'slide deck video', 'quality': -2, 'preference': -2, @@ -76,7 +79,9 @@ class GDCVaultIE(InfoExtractor): }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text video_formats.append({ - 'url': 'rtmp://' + akami_url + '/' + speaker_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'speaker video', 'quality': -1, 'preference': -1, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8dce96a64..875e1bf05 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -547,7 +547,16 @@ class GenericIE(InfoExtractor): 'id': 'aanslagen-kopenhagen', 'title': 'Aanslagen Kopenhagen | RTL Nieuws', } - } + }, + # Zapiks embed + { + 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', + 'info_dict': { + 'id': '118046', + 'ext': 'mp4', + 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', + } + }, ] def report_following_redirect(self, new_url): @@ -1098,6 +1107,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') + # Look for Zapiks embed + mobj = re.search( + r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Zapiks') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index b16c7aed0..fe5d95e2c 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -19,16 +19,16 @@ class ImgurIE(InfoExtractor): 'info_dict': { 'id': 'A61SaA1', 'ext': 'mp4', - 'title': 'MRW gifv is up and running without any bugs', - 
'description': 'The Internet\'s visual storytelling community. Explore, share, and discuss the best visual stories the Internet has to offer.', + 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', }, }, { 'url': 'https://imgur.com/A61SaA1', 'info_dict': { 'id': 'A61SaA1', 'ext': 'mp4', - 'title': 'MRW gifv is up and running without any bugs', - 'description': 'The Internet\'s visual storytelling community. Explore, share, and discuss the best visual stories the Internet has to offer.', + 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', }, }] diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 2fd3b4699..e8ca49fd1 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,23 +1,26 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import random import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + xpath_text, +) class Laola1TvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html' _TEST = { - 'url': 'http://www.laola1.tv/de-de/live/bwf-bitburger-open-grand-prix-gold-court-1/250019.html', + 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { - 'id': '250019', + 'id': '227883', 'ext': 'mp4', - 'title': 'Bitburger Open Grand Prix Gold - Court 1', - 'categories': ['Badminton'], - 'uploader': 'BWF - Badminton World Federation', - 'is_live': True, + 'title': 'Straubing 
Tigers - Kölner Haie', + 'categories': ['Eishockey'], + 'is_live': False, }, 'params': { 'skip_download': True, @@ -43,15 +46,26 @@ class Laola1TvIE(InfoExtractor): r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe) flashvars = dict((m[0], m[1]) for m in flashvars_m) + partner_id = self._search_regex( + r'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') + xml_url = ('http://www.laola1.tv/server/hd_video.php?' + - 'play=%s&partner=1&portal=%s&v5ident=&lang=%s' % ( - video_id, portal, lang)) + 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % ( + video_id, partner_id, portal, lang)) hd_doc = self._download_xml(xml_url, video_id) - title = hd_doc.find('.//video/title').text - flash_url = hd_doc.find('.//video/url').text - categories = hd_doc.find('.//video/meta_sports').text.split(',') - uploader = hd_doc.find('.//video/meta_organistation').text + title = xpath_text(hd_doc, './/video/title', fatal=True) + flash_url = xpath_text(hd_doc, './/video/url', fatal=True) + uploader = xpath_text(hd_doc, './/video/meta_organistation') + + is_live = xpath_text(hd_doc, './/video/islive') == 'true' + if is_live: + raise ExtractorError( + 'Live streams are not supported by the f4m downloader.') + + categories = xpath_text(hd_doc, './/video/meta_sports') + if categories: + categories = categories.split(',') ident = random.randint(10000000, 99999999) token_url = '%s&ident=%s&klub=0&unikey=0&timestamp=%s&auth=%s' % ( @@ -60,15 +74,16 @@ class Laola1TvIE(InfoExtractor): token_doc = self._download_xml( token_url, video_id, note='Downloading token') token_attrib = token_doc.find('.//token').attrib - if token_attrib.get('auth') == 'blocked': - raise ExtractorError('Token error: ' % token_attrib.get('comment')) + if token_attrib.get('auth') in ('blocked', 'restricted'): + raise ExtractorError( + 'Token error: %s' % token_attrib.get('comment'), expected=True) video_url = '%s?hdnea=%s&hdcore=3.2.0' % ( token_attrib['url'], token_attrib['auth']) return { 'id': video_id, - 'is_live': True, +
'is_live': is_live, 'title': title, 'url': video_url, 'uploader': uploader, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 762cefa34..109055e72 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re import json -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..compat import ( compat_str, @@ -16,7 +15,7 @@ from ..utils import ( ) -class LyndaIE(SubtitlesInfoExtractor): +class LyndaIE(InfoExtractor): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' @@ -88,11 +87,7 @@ class LyndaIE(SubtitlesInfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, page) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) + subtitles = self.extract_subtitles(video_id, page) return { 'id': video_id, @@ -144,38 +139,31 @@ class LyndaIE(SubtitlesInfoExtractor): if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: raise ExtractorError('Unable to log in') - def _fix_subtitles(self, subtitles): - if subtitles is None: - return subtitles # subtitles not requested - - fixed_subtitles = {} - for k, v in subtitles.items(): - subs = json.loads(v) - if len(subs) == 0: + def _fix_subtitles(self, subs): + srt = '' + for pos in range(0, len(subs) - 1): + seq_current = subs[pos] + m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) + if m_current is None: continue - srt = '' - for pos in range(0, len(subs) - 1): - seq_current = subs[pos] - m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) - if m_current is None: - continue - seq_next = subs[pos + 1] - m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) - if m_next is None: - continue - appear_time = 
m_current.group('timecode') - disappear_time = m_next.group('timecode') - text = seq_current['Caption'] - srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) - if srt: - fixed_subtitles[k] = srt - return fixed_subtitles + seq_next = subs[pos + 1] + m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) + if m_next is None: + continue + appear_time = m_current.group('timecode') + disappear_time = m_next.group('timecode') + text = seq_current['Caption'] + srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) + if srt: + return srt - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - sub = self._download_webpage(url, None, False) - sub_json = json.loads(sub) - return {'en': url} if len(sub_json) > 0 else {} + subs = self._download_json(url, None, False) + if subs: + return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + else: + return {} class LyndaCourseIE(InfoExtractor): diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 3c61a850f..d7ab6a9ae 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -5,9 +5,6 @@ import json from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import ( - compat_urlparse, -) from ..utils import ( clean_html, ExtractorError, @@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor): 'upload_date': '20121109', 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', - # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' } }, { @@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor): 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains 
lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', - # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' } } ] @@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7]) else: # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) @@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5]) else: raise ExtractorError('Unable to find embedded YouTube video.') video_id = YoutubeIE.extract_id(yt) @@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor): 'title': title, 'description': description, 'url': yt, - 'url_transparent' - 'subtitles': subs, 'ie_key': 'Youtube', } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index bc7f49ebb..c11de1cb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -23,7 +23,7 @@ def _media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVServicesInfoExtractor(SubtitlesInfoExtractor): +class MTVServicesInfoExtractor(InfoExtractor): _MOBILE_TEMPLATE = None @staticmethod @@ -95,25 +95,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): def _extract_subtitles(self, mdoc, mtvn_id): subtitles = {} - FORMATS = { - 'scc': 'cea-608', - 'eia-608': 'cea-608', - 'xml': 
'ttml', - } - subtitles_format = FORMATS.get( - self._downloader.params.get('subtitlesformat'), 'ttml') for transcript in mdoc.findall('.//transcript'): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - for typographic in transcript.findall('./typographic'): - captions_format = typographic.get('format') - if captions_format == subtitles_format: - subtitles[lang] = compat_str(typographic.get('src')) - break - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(mtvn_id, subtitles) - return self.extract_subtitles(mtvn_id, subtitles) + subtitles[lang] = [{ + 'url': compat_str(typographic.get('src')), + 'ext': typographic.get('format') + } for typographic in transcript.findall('./typographic')] + return subtitles def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text @@ -196,8 +186,6 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): webpage, 'mgid') videos_info = self._get_videos_info(mgid) - if self._downloader.params.get('listsubtitles', False): - return return videos_info diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c075618e8..9c01eb0af 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, @@ -12,7 +11,7 @@ from ..utils import ( ) -class NPOBaseIE(SubtitlesInfoExtractor): +class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', @@ -164,13 +163,10 @@ class NPOIE(NPOBaseIE): subtitles = {} if metadata.get('tt888') == 'ja': - subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + 
subtitles['nl'] = [{ + 'ext': 'vtt', + 'url': 'http://e.omroep.nl/tt888/%s' % video_id, + }] return { 'id': video_id, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f6de26022..1e4cfa2e7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, parse_duration, unified_strdate, ) -from .subtitles import SubtitlesInfoExtractor class NRKIE(InfoExtractor): @@ -73,7 +73,7 @@ class NRKIE(InfoExtractor): } -class NRKTVIE(SubtitlesInfoExtractor): +class NRKTVIE(InfoExtractor): _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ @@ -156,10 +156,12 @@ class NRKTVIE(SubtitlesInfoExtractor): if self._downloader.params.get('verbose', False): self.to_screen('[debug] %s' % txt) - def _extract_captions(self, subtitlesurl, video_id, baseurl): + def _get_subtitles(self, subtitlesurl, video_id, baseurl): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) - captions = self._download_xml(url, video_id, 'Downloading subtitles') + captions = self._download_xml( + url, video_id, 'Downloading subtitles', + transform_source=lambda s: s.replace(r'<br />', '\r\n')) lang = captions.get('lang', 'no') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}')) srt = '' @@ -168,9 +170,11 @@ class NRKTVIE(SubtitlesInfoExtractor): duration = parse_duration(p.get('dur')) starttime = self._seconds2str(begin) endtime = self._seconds2str(begin + duration) - text = '\n'.join(p.itertext()) - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) - return {lang: srt} + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text) + return 
{lang: [ + {'ext': 'ttml', 'url': url}, + {'ext': 'srt', 'data': srt}, + ]} def _extract_f4m(self, manifest_url, video_id): return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) @@ -243,10 +247,7 @@ class NRKTVIE(SubtitlesInfoExtractor): webpage, 'subtitle URL', default=None) subtitles = None if subtitles_url: - subtitles = self._extract_captions(subtitles_url, video_id, baseurl) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl) return { 'id': video_id, diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py new file mode 100644 index 000000000..976c8feec --- /dev/null +++ b/youtube_dl/extractor/r7.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + unescapeHTML, + int_or_none, +) + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P<id>[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { 
+ 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://player.r7.com/video/i/%s' % video_id, video_id) + + item = self._parse_json(js_to_json(self._search_regex( + r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) + + title = unescapeHTML(item['title']) + thumbnail = item.get('init', {}).get('thumbUri') + duration = None + + statistics = item.get('statistics', {}) + like_count = int_or_none(statistics.get('likes')) + view_count = int_or_none(statistics.get('views')) + + formats = [] + for format_key, format_dict in item['playlist'][0].items(): + src = format_dict.get('src') + if not src: + continue + format_id = format_dict.get('format') or format_key + if duration is None: + duration = format_dict.get('duration') + if '.f4m' in src: + formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) + elif src.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) + else: + formats.append({ + 'url': src, + 'format_id': format_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index aa26b7e0b..144e33982 100644 --- a/youtube_dl/extractor/rai.py +++ 
b/youtube_dl/extractor/rai.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, ) @@ -12,7 +12,7 @@ from ..utils import ( ) -class RaiIE(SubtitlesInfoExtractor): +class RaiIE(InfoExtractor): _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _TESTS = [ { @@ -89,15 +89,7 @@ class RaiIE(SubtitlesInfoExtractor): 'ext': 'mp4', }) - if self._downloader.params.get('listsubtitles', False): - page = self._download_webpage(url, video_id) - self._list_available_subtitles(video_id, page) - return - - subtitles = {} - if self._have_to_download_any_subtitles: - page = self._download_webpage(url, video_id) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id, url) return { 'id': video_id, @@ -111,7 +103,8 @@ class RaiIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, url): + webpage = self._download_webpage(url, video_id) subtitles = {} m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) if m: @@ -120,5 +113,8 @@ class RaiIE(SubtitlesInfoExtractor): SRT_EXT = '.srt' if captions.endswith(STL_EXT): captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) + subtitles['it'] = [{ + 'ext': 'srt', + 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), + }] return subtitles diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 3469d9578..c0fd23ff1 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,7 @@ import re import time from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( 
struct_unpack, remove_end, @@ -96,12 +97,14 @@ class RTVEALaCartaIE(InfoExtractor): ).replace('.net.rtve', '.multimedia.cdn.rtve') video_path = self._download_webpage( auth_url, video_id, 'Getting video url') - # Use mvod.akcdn instead of flash.akamaihd.multimedia.cdn to get + # Use mvod1.akcdn instead of flash.akamaihd.multimedia.cdn to get # the right Content-Length header and the mp4 format - video_url = ( - 'http://mvod.akcdn.rtve.es/{0}&v=2.6.8' - '&fp=MAC%2016,0,0,296&r=MRUGG&g=OEOJWFXNFGCP'.format(video_path) - ) + video_url = compat_urlparse.urljoin( + 'http://mvod1.akcdn.rtve.es/', video_path) + + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) return { 'id': video_id, @@ -109,8 +112,17 @@ class RTVEALaCartaIE(InfoExtractor): 'url': video_url, 'thumbnail': info.get('image'), 'page_url': url, + 'subtitles': subtitles, } + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index a4f8ce6c3..3a4ddf57e 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -7,6 +7,7 @@ from .common import InfoExtractor class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', @@ -38,3 +39,26 @@ class SoundgasmIE(InfoExtractor): 'title': audio_title, 'description': description } + + +class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 
'info_dict': { + 'id': 'ytdl', + }, + 'playlist_count': 1, + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + webpage = self._download_webpage(url, profile_id) + + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] + + return self.playlist_result(entries, profile_id) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py deleted file mode 100644 index 59a51268d..000000000 --- a/youtube_dl/extractor/subtitles.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( - ExtractorError, -) - - -class SubtitlesInfoExtractor(InfoExtractor): - @property - def _have_to_download_any_subtitles(self): - return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub')]) - - def _list_available_subtitles(self, video_id, webpage): - """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id, webpage) - auto_captions_list = self._get_available_automatic_caption(video_id, webpage) - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen('%s: Available subtitles for video: %s' % - (video_id, sub_lang)) - auto_lang = ",".join(auto_captions_list.keys()) - self.to_screen('%s: Available automatic captions for video: %s' % - (video_id, auto_lang)) - - def extract_subtitles(self, video_id, webpage): - """ - returns {sub_lang: sub} ,{} if subtitles not found or None if the - subtitles aren't requested. 
- """ - if not self._have_to_download_any_subtitles: - return None - available_subs_list = {} - if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) - if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id, webpage)) - - if not available_subs_list: # error, it didn't get the available subtitles - return {} - if self._downloader.params.get('allsubtitles', False): - sub_lang_list = available_subs_list - else: - if self._downloader.params.get('subtitleslangs', False): - requested_langs = self._downloader.params.get('subtitleslangs') - elif 'en' in available_subs_list: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs_list.keys())[0]] - - sub_lang_list = {} - for sub_lang in requested_langs: - if sub_lang not in available_subs_list: - self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang) - continue - sub_lang_list[sub_lang] = available_subs_list[sub_lang] - - subtitles = {} - for sub_lang, url in sub_lang_list.items(): - subtitle = self._request_subtitle_url(sub_lang, url) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - - def _download_subtitle_url(self, sub_lang, url): - return self._download_webpage(url, None, note=False) - - def _request_subtitle_url(self, sub_lang, url): - """ makes the http request for the subtitle """ - try: - sub = self._download_subtitle_url(sub_lang, url) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning('Did not fetch video subtitles') - return - return sub - - def _get_available_subtitles(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses - """ - - # By default, allow 
implementations to simply pass in the result - assert isinstance(webpage, dict), \ - '_get_available_subtitles not implemented' - return webpage - - def _get_available_automatic_caption(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses that support automatic captions, - otherwise it will return {} - """ - self._downloader.report_warning('Automatic Captions not supported by this server') - return {} diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index a73da1c9c..5793dbc10 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor +from ..utils import qualities class TeamcocoIE(InfoExtractor): @@ -24,8 +26,8 @@ class TeamcocoIE(InfoExtractor): 'info_dict': { 'id': '19705', 'ext': 'mp4', - "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.", - "title": "Louis C.K. Interview Pt. 1 11/3/11", + 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', + 'title': 'Louis C.K. Interview Pt. 
1 11/3/11', 'age_limit': 0, } } @@ -42,42 +44,39 @@ class TeamcocoIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - video_id = mobj.group("video_id") + video_id = mobj.group('video_id') if not video_id: video_id = self._html_search_regex( self._VIDEO_ID_REGEXES, webpage, 'video id') - data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id - data = self._download_xml( - data_url, display_id, 'Downloading data webpage') + embed_url = 'http://teamcoco.com/embed/v/%s' % video_id + embed = self._download_webpage( + embed_url, video_id, 'Downloading embed page') + + encoded_data = self._search_regex( + r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') + data = self._parse_json( + base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) - qualities = ['500k', '480p', '1000k', '720p', '1080p'] formats = [] - for filed in data.findall('files/file'): - if filed.attrib.get('playmode') == 'all': - # it just duplicates one of the entries - break - file_url = filed.text - m_format = re.search(r'(\d+(k|p))\.mp4', file_url) + get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) + for filed in data['files']: + m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) if m_format is not None: format_id = m_format.group(1) else: - format_id = filed.attrib['bitrate'] + format_id = filed['bitrate'] tbr = ( - int(filed.attrib['bitrate']) - if filed.attrib['bitrate'].isdigit() + int(filed['bitrate']) + if filed['bitrate'].isdigit() else None) - try: - quality = qualities.index(format_id) - except ValueError: - quality = -1 formats.append({ - 'url': file_url, + 'url': filed['url'], 'ext': 'mp4', 'tbr': tbr, 'format_id': format_id, - 'quality': quality, + 'quality': get_quality(format_id), }) self._sort_formats(formats) @@ -86,8 +85,8 @@ class TeamcocoIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': self._og_search_title(webpage), - 'thumbnail': 
self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'title': data['title'], + 'thumbnail': data.get('thumb', {}).get('href'), + 'description': data.get('teaser'), 'age_limit': self._family_friendly_search(webpage), } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 59678399d..4cec06f8b 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import json import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor): _VALID_URL = r'''(?x) (?P<proto>https?://) (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ @@ -184,11 +184,6 @@ class TEDIE(SubtitlesInfoExtractor): self._sort_formats(formats) video_id = compat_str(talk_info['id']) - # subtitles - video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -199,21 +194,25 @@ class TEDIE(SubtitlesInfoExtractor): 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'subtitles': video_subtitles, + 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for 
ext in ['ted', 'srt'] + ] return sub_lang_list else: - self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f7b34bd26..feac666f7 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,7 +8,7 @@ import binascii import hashlib -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) @@ -22,7 +22,7 @@ from ..utils import ( _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) -class ThePlatformIE(SubtitlesInfoExtractor): +class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? @@ -106,15 +106,11 @@ class ThePlatformIE(SubtitlesInfoExtractor): captions = info.get('captions') if isinstance(captions, list): for caption in captions: - lang, src = caption.get('lang'), caption.get('src') - if lang and src: - subtitles[lang] = src - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles[lang] = [{ + 'ext': 'srt' if mime == 'text/srt' else 'ttml', + 'url': src, + }] head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 944901e14..6816dacb6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,16 +2,17 @@ from __future__ import unicode_literals import re +from ..compat import compat_urlparse from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, US_RATINGS, ) -from .subtitles import SubtitlesInfoExtractor +from 
.common import InfoExtractor -class VikiIE(SubtitlesInfoExtractor): +class VikiIE(InfoExtractor): IE_NAME = 'viki' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' @@ -69,9 +70,6 @@ class VikiIE(SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, info_webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, info_webpage) - return return { 'id': video_id, @@ -85,12 +83,15 @@ class VikiIE(SubtitlesInfoExtractor): 'upload_date': upload_date, } - def _get_available_subtitles(self, video_id, info_webpage): + def _get_subtitles(self, video_id, info_webpage): res = {} - for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage): + for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage): sturl = unescapeHTML(sturl_html) m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) if not m: continue - res[m.group('lang')] = sturl + res[m.group('lang')] = [{ + 'url': compat_urlparse.urljoin('http://www.viki.com', sturl), + 'ext': 'vtt', + }] return res diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 78d287e0e..8f540f578 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,9 +4,9 @@ from __future__ import unicode_literals import json import re import itertools +import hashlib from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -52,7 +52,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._download_webpage(login_request, None, False, 'Wrong login info') -class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): +class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs @@ -225,6 +225,11 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + 
video_id + password = self._downloader.params.get('videopassword', None) + if password: + headers['Cookie'] = '%s_password=%s' % ( + video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) + # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) try: @@ -372,12 +377,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): text_tracks = config['request'].get('text_tracks') if text_tracks: for tt in text_tracks: - subtitles[tt['lang']] = 'http://vimeo.com' + tt['url'] - - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'http://vimeo.com' + tt['url'], + }] return { 'id': video_id, @@ -393,7 +396,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 672bda7a7..24efbd6e6 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ( xpath_text, int_or_none, ) -class WallaIE(SubtitlesInfoExtractor): +class WallaIE(InfoExtractor): _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', @@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor): subtitles = {} for subtitle in item.findall('./subtitles/subtitle'): lang = xpath_text(subtitle, './title') - subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') - - if self._downloader.params.get('listsubtitles', 
False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'ext': 'srt', + 'url': xpath_text(subtitle, './src'), + }] formats = [] for quality in item.findall('./qualities/quality'): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d3d43491..3690f8021 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -648,7 +647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -662,23 +661,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' 
+ params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -708,14 +711,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles @@ -970,10 +979,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, 
video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@ -1122,6 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, @@ -1146,13 +1153,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' @@ -1237,7 +1244,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for vid_id in ids] def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( @@ -1273,7 +1280,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - if playlist_id.startswith('RD'): + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py new file mode 
100644 index 000000000..22a9a57e8 --- /dev/null +++ b/youtube_dl/extractor/zapiks.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + xpath_with_ns, + xpath_text, + int_or_none, +) + + +class ZapiksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))' + _TESTS = [ + { + 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', + 'md5': 'aeb3c473b2d564b2d46d664d28d5f050', + 'info_dict': { + 'id': '80798', + 'ext': 'mp4', + 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', + 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 528, + 'timestamp': 1359044972, + 'upload_date': '20130124', + 'view_count': int, + 'comment_count': int, + }, + }, + { + 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + if not video_id: + video_id = self._search_regex( + r'data-media-id="(\d+)"', webpage, 'video id') + + playlist = self._download_xml( + 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id, + display_id) + + NS_MAP = { + 'jwplayer': 'http://rss.jwpcdn.com/' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./channel/item') + + title = xpath_text(item, 'title', 'title') or 
self._og_search_title(webpage) + description = self._og_search_description(webpage, default=None) + thumbnail = xpath_text( + item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None), ' ') + + view_count = int_or_none(self._search_regex( + r'UserPlays:(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'UserComments:(\d+)', webpage, 'comment count', default=None)) + + formats = [] + for source in item.findall(ns('./jwplayer:source')): + format_id = source.attrib['label'] + f = { + 'url': source.attrib['file'], + 'format_id': format_id, + } + m = re.search(r'^(?P<height>\d+)[pP]', format_id) + if m: + f['height'] = int(m.group('height')) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5f678f76b..886ce9613 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -272,6 +272,10 @@ def parseOpts(overrideArguments=None): '--no-playlist', action='store_true', dest='noplaylist', default=False, help='If the URL refers to a video and a playlist, download only the video.') + selection.add_option( + '--yes-playlist', + action='store_false', dest='noplaylist', default=False, + help='If the URL refers to a video and a playlist, download the playlist.') selection.add_option( '--age-limit', metavar='YEARS', dest='age_limit', default=None, type=int, @@ -387,8 +391,8 @@ def parseOpts(overrideArguments=None): help='lists all available subtitles for the video') subtitles.add_option( 
'--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', - help='subtitle format (default=srt) ([sbv/vtt] youtube only)') + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', + help='subtitle format, accepts formats preference, for example: "ass/srt/best"') subtitles.add_option( '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 3f2e6cf1d..398fe050e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -496,10 +496,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): 'zu': 'zul', } - def __init__(self, downloader=None, subtitlesformat='srt'): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) - self._subformat = subtitlesformat - @classmethod def _conver_lang_code(cls, code): """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -509,13 +505,14 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): if information['ext'] != 'mp4': self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - if not information.get('subtitles'): + subtitles = information.get('requested_subtitles') + if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return True, information - sub_langs = [key for key in information['subtitles']] + sub_langs = list(subtitles.keys()) filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] opts = [ '-map', '0', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 238b6556b..e2631dccd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -304,6 +304,8 @@ def sanitize_filename(s, 
restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] if not result: result = '_' return result @@ -900,8 +902,8 @@ def _windows_write_string(s, out): def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or + GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 537e8cf60..d23c6ae3d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.19.3' +__version__ = '2015.02.24.2'