diff --git a/.travis.yml b/.travis.yml index 45b71f11b..c6cc7a994 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ python: - "2.6" - "2.7" - "3.3" + - "3.4" script: nosetests test --verbose notifications: email: diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index 3fa116733..000000000 --- a/CHANGELOG +++ /dev/null @@ -1,14 +0,0 @@ -2013.01.02 Codename: GIULIA - - * Add support for ComedyCentral clips - * Corrected Vimeo description fetching - * Added the --no-post-overwrites argument - * --verbose offers more environment info - * New info_dict field: uploader_id - * New updates system, with signature checking - * New IEs: NBA, JustinTV, FunnyOrDie, TweetReel, Steam, Ustream - * Fixed IEs: BlipTv - * Fixed for Python 3 IEs: Xvideo, Youku, XNXX, Dailymotion, Vimeo, InfoQ - * Simplified IEs and test code - * Various (Python 3 and other) fixes - * Revamped and expanded tests diff --git a/Makefile b/Makefile index f7d917d09..a82785861 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion clean: - rm -rf youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz cleanall: clean rm -f youtube-dl youtube-dl.exe @@ -55,7 +55,9 @@ README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - pandoc -s -f markdown -t man README.md -o youtube-dl.1 + python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 + rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in python devscripts/bash-completion.py diff --git a/README.md b/README.md index b4069515e..2bea609bf 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,24 @@ -% 
YOUTUBE-DL(1) - -# NAME youtube-dl - download videos from youtube.com or other video platforms # SYNOPSIS **youtube-dl** [OPTIONS] URL [URL...] +# INSTALLATION + +To install it right away for all UNIX users (Linux, OS X, etc.), type: + + sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +If you do not have curl, you can alternatively use a recent wget: + + sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). + +Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . + # DESCRIPTION **youtube-dl** is a small command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version @@ -458,7 +471,7 @@ If your report is shorter than two lines, it is almost certainly missing some of For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +Site support requests **must contain an example URL**. 
An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index cae1fa4f2..70fa942dd 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -15,7 +15,7 @@ header = oldreadme[:oldreadme.index('# OPTIONS')] footer = oldreadme[oldreadme.index('# CONFIGURATION'):] options = helptext[helptext.index(' General Options:') + 19:] -options = re.sub(r'^ (\w.+)$', r'## \1', options, flags=re.M) +options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options) options = '# OPTIONS\n' + options + '\n' with io.open(README_FILE, 'w', encoding='utf-8') as f: diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py new file mode 100644 index 000000000..d9c857015 --- /dev/null +++ b/devscripts/prepare_manpage.py @@ -0,0 +1,20 @@ + +import io +import os.path +import sys +import re + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +README_FILE = os.path.join(ROOT_DIR, 'README.md') + +with io.open(README_FILE, encoding='utf-8') as f: + readme = f.read() + +PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n' +readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme) +readme = PREFIX + readme + +if sys.version_info < (3, 0): + print(readme.encode('utf-8')) +else: + print(readme) diff --git a/test/helper.py b/test/helper.py index 984f2554a..1fa12a1e4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -74,13 +74,19 @@ class FakeYDL(YoutubeDL): old_report_warning(message) self.report_warning = types.MethodType(report_warning, self) -def gettestcases(): + +def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): t = getattr(ie, '_TEST', None) if t: - t['name'] = 
type(ie).__name__[:-len('IE')] - yield t - for t in getattr(ie, '_TESTS', []): + assert not hasattr(ie, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(ie).__name__ + tests = [t] + else: + tests = getattr(ie, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue t['name'] = type(ie).__name__[:-len('IE')] yield t @@ -101,7 +107,7 @@ def expect_info_dict(self, expected_dict, got_dict): elif isinstance(expected, type): got = got_dict.get(info_field) self.assertTrue(isinstance(got, expected), - u'Expected type %r, but got value %r of type %r' % (expected, got, type(got))) + u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got_dict.get(info_field)) @@ -129,3 +135,17 @@ def expect_info_dict(self, expected_dict, got_dict): missing_keys, 'Missing keys in test definition: %s' % ( ', '.join(sorted(missing_keys)))) + + +def assertRegexpMatches(self, text, regexp, msg=None): + if hasattr(self, 'assertRegexpMatches'): + return self.assertRegexpMatches(text, regexp, msg) + else: + m = re.match(regexp, text) + if not m: + note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text) + if msg is None: + msg = note + else: + msg = note + ', ' + msg + self.assertTrue(m, msg) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 2902dbec7..e794cc97f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL from youtube_dl.extractor import YoutubeIE @@ -67,7 +67,7 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'mp4') - # No prefer_free_formats 
=> prefer mp4 and flv for greater compatibilty + # No prefer_free_formats => prefer mp4 and flv for greater compatibility ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ @@ -274,6 +274,12 @@ class TestFormatSelection(unittest.TestCase): # Replace missing fields with 'NA' self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + def test_format_note(self): + ydl = YoutubeDL() + self.assertEqual(ydl._format_note({}), '') + assertRegexpMatches(self, ydl._format_note({ + 'vbr': 10, + }), '^\s*10k$') if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 7f7362a3a..4b56137ce 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -77,20 +77,20 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_justin_tv_channelid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/")) - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/")) + self.assertTrue(JustinTVIE.suitable('justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv')) + 
self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv/')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/')) def test_justintv_videoid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/b/328087483')) def test_justin_tv_chapterid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361')) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) @@ -106,7 +106,7 @@ class TestAllURLsMatching(unittest.TestCase): def test_no_duplicates(self): ies = gen_extractors() - for tc in gettestcases(): + for tc in gettestcases(include_onlymatching=True): url = tc['url'] for ie in ies: if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'): @@ -176,5 +176,6 @@ class TestAllURLsMatching(unittest.TestCase): 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 17f1e5fab..63d31db8c 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -10,6 +10,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( + assertRegexpMatches, expect_info_dict, FakeYDL, ) @@ -22,9 +23,11 @@ from youtube_dl.extractor import ( VimeoUserIE, VimeoAlbumIE, VimeoGroupsIE, + VineUserIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, + SoundcloudPlaylistIE, LivestreamIE, NHLVideocenterIE, BambuserChannelIE, @@ -100,6 +103,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], 'Rolex Awards for Enterprise') self.assertTrue(len(result['entries']) > 72) + def test_vine_user(self): + dl = FakeYDL() + ie = 
VineUserIE(dl) + result = ie.extract('https://vine.co/Visa') + self.assertIsPlaylist(result) + self.assertTrue(len(result['entries']) >= 50) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) @@ -124,6 +134,17 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], '9615865') self.assertTrue(len(result['entries']) >= 12) + def test_soundcloud_playlist(self): + dl = FakeYDL() + ie = SoundcloudPlaylistIE(dl) + result = ie.extract('http://api.soundcloud.com/playlists/4110309') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '4110309') + self.assertEqual(result['title'], 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]') + assertRegexpMatches( + self, result['description'], r'TILT Brass - Bowery Poetry Club') + self.assertEqual(len(result['entries']), 6) + def test_livestream_event(self): dl = FakeYDL() ie = LivestreamIE(dl) @@ -192,16 +213,16 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dezhurnyi_angel') self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012)') - self.assertTrue(len(result['entries']) >= 36) - + self.assertTrue(len(result['entries']) >= 16) + def test_ivi_compilation_season(self): dl = FakeYDL() ie = IviCompilationIE(dl) - result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2') + result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season1') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'dezhurnyi_angel/season2') - self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 2 сезон') - self.assertTrue(len(result['entries']) >= 20) + self.assertEqual(result['id'], 'dezhurnyi_angel/season1') + self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 1 сезон') + self.assertTrue(len(result['entries']) >= 16) def test_imdb_list(self): dl = FakeYDL() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 79991e646..5736fe581 100644 --- 
a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -181,7 +181,7 @@ class TestTedSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles.keys()), 28) + self.assertTrue(len(subtitles.keys()) >= 28) def test_list_subtitles(self): self.DL.expect_warning(u'Automatic Captions not supported by this server') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dbdf79cac..a1531ae0b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -31,6 +31,7 @@ from .utils import ( ContentTooShortError, date_from_str, DateRange, + DEFAULT_OUTTMPL, determine_ext, DownloadError, encodeFilename, @@ -441,7 +442,8 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - tmpl = os.path.expanduser(self.params['outtmpl']) + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) + tmpl = os.path.expanduser(outtmpl) filename = tmpl % template_dict return filename except ValueError as err: @@ -1049,10 +1051,11 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in self.params['outtmpl'] + '%' not in outtmpl and self.params.get('max_downloads') != 1): - raise SameFileError(self.params['outtmpl']) + raise SameFileError(outtmpl) for url in url_list: try: @@ -1163,57 +1166,57 @@ class YoutubeDL(object): res = default return res - def list_formats(self, info_dict): - def format_note(fdict): - res = '' - if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' - if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' - if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] - if fdict.get('container') is not None: - if res: - res += ', ' - res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None 
and - fdict.get('vcodec') != 'none'): - if res: - res += ', ' - res += fdict['vcodec'] - if fdict.get('vbr') is not None: - res += '@' - elif fdict.get('vbr') is not None and fdict.get('abr') is not None: - res += 'video@' + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported) ' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + ' ' + if fdict.get('tbr') is not None: + res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] if fdict.get('vbr') is not None: - res += '%4dk' % fdict['vbr'] - if fdict.get('acodec') is not None: - if res: - res += ', ' - if fdict['acodec'] == 'none': - res += 'video only' - else: - res += '%-5s' % fdict['acodec'] - elif fdict.get('abr') is not None: - if res: - res += ', ' - res += 'audio' - if fdict.get('abr') is not None: - res += '@%3dk' % fdict['abr'] - if fdict.get('asr') is not None: - res += ' (%5dHz)' % fdict['asr'] - if fdict.get('filesize') is not None: - if res: - res += ', ' - res += format_bytes(fdict['filesize']) - return res + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + return res + def list_formats(self, info_dict): def line(format, idlen=20): return 
(('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), - format_note(format), + self._format_note(format), )) formats = info_dict.get('formats', [info_dict]) @@ -1221,8 +1224,8 @@ class YoutubeDL(object): max(len(f['format_id']) for f in formats)) formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: - formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' - formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' + formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)' + formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': 'format code', 'ext': 'extension', diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fd0291849..1890b2abe 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -53,6 +53,10 @@ __authors__ = ( 'Mattias Harrysson', 'phaer', 'Sainyam Kapoor', + 'Nicolas Évrard', + 'Jason Normore', + 'Hoje Lee', + 'Adam Thalhammer', ) __license__ = 'Public Domain' @@ -72,6 +76,7 @@ from .utils import ( compat_getpass, compat_print, DateRange, + DEFAULT_OUTTMPL, decodeOption, get_term_width, DownloadError, @@ -679,7 +684,7 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error(u'invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: parser.error(u'invalid video recode format specified') if opts.date is not None: date = DateRange.day(opts.date) @@ -708,7 +713,7 @@ def _real_main(argv=None): or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s') or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') - or u'%(title)s-%(id)s.%(ext)s') + or DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error(u'Cannot download a video and extract 
audio into the same' u' file! Use "{0}.%(ext)s" instead of "{0}" as the output' diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index cc8b9c9a7..f79e6a995 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -14,6 +14,8 @@ from ..utils import ( class HttpFD(FileDownloader): + _TEST_FILE_SIZE = 10241 + def real_download(self, filename, info_dict): url = info_dict['url'] tmpfilename = self.temp_name(filename) @@ -28,8 +30,10 @@ class HttpFD(FileDownloader): basic_request = compat_urllib_request.Request(url, None, headers) request = compat_urllib_request.Request(url, None, headers) - if self.params.get('test', False): - request.add_header('Range', 'bytes=0-10240') + is_test = self.params.get('test', False) + + if is_test: + request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) # Establish possible resume length if os.path.isfile(encodeFilename(tmpfilename)): @@ -100,6 +104,15 @@ class HttpFD(FileDownloader): return False data_len = data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. 
+ if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + if data_len is not None: data_len = int(data_len) + resume_len min_data_len = self.params.get("min_filesize", None) @@ -118,7 +131,7 @@ class HttpFD(FileDownloader): while True: # Download and write before = time.time() - data_block = data.read(block_size) + data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) after = time.time() if len(data_block) == 0: break @@ -162,6 +175,9 @@ class HttpFD(FileDownloader): 'speed': speed, }) + if is_test and byte_counter == data_len: + break + # Apply rate limit self.slow_down(start, byte_counter - resume_len) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 94233bcc3..78b1e7cd2 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -10,6 +10,7 @@ from .common import FileDownloader from ..utils import ( encodeFilename, format_bytes, + compat_str, ) @@ -127,7 +128,10 @@ class RtmpFD(FileDownloader): basic_args += ['--flashVer', flash_version] if live: basic_args += ['--live'] - if conn: + if isinstance(conn, list): + for entry in conn: + basic_args += ['--conn', entry] + elif isinstance(conn, compat_str): basic_args += ['--conn', conn] args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3a91e1a46..def58f1d6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE @@ -40,6 +41,7 @@ from .cinemassacre import CinemassacreIE from .clipfish 
import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .clubic import ClubicIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( @@ -70,6 +72,7 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .eitb import EitbIE from .elpais import ElPaisIE +from .empflix import EmpflixIE from .engadget import EngadgetIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE @@ -77,6 +80,7 @@ from .exfm import ExfmIE from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE +from .fc2 import FC2IE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -107,10 +111,12 @@ from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .hark import HarkIE from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .huffpost import HuffPostIE from .hypem import HypemIE +from .iconosquare import IconosquareIE from .ign import IGNIE, OneUPIE from .imdb import ( ImdbIE, @@ -158,6 +164,7 @@ from .mofosex import MofosexIE from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motorsport import MotorsportIE +from .moviezine import MoviezineIE from .movshare import MovShareIE from .mtv import ( MTVIE, @@ -177,15 +184,23 @@ from .nbc import ( from .ndr import NDRIE from .ndtv import NDTVIE from .newgrounds import NewgroundsIE +from .newstube import NewstubeIE from .nfb import NFBIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE +from .noco import NocoIE from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .nrk import ( + NRKIE, + NRKTVIE, +) from .ntv import NTVIE +from .nytimes import NYTimesIE +from .nuvid import NuvidIE 
from .oe1 import OE1IE from .ooyala import OoyalaIE from .orf import ORFIE @@ -206,6 +221,7 @@ from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rtbf import RTBFIE from .rtlnow import RTLnowIE from .rts import RTSIE from .rtve import RTVEALaCartaIE @@ -217,9 +233,11 @@ from .rutube import ( ) from .rutv import RUTVIE from .savefrom import SaveFromIE +from .scivee import SciVeeIE from .servingsys import ServingSysIE from .sina import SinaIE from .slideshare import SlideshareIE +from .slutload import SlutloadIE from .smotri import ( SmotriIE, SmotriCommunityIE, @@ -227,7 +245,12 @@ from .smotri import ( SmotriBroadcastIE, ) from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .soundcloud import ( + SoundcloudIE, + SoundcloudSetIE, + SoundcloudUserIE, + SoundcloudPlaylistIE +) from .southparkstudios import ( SouthParkStudiosIE, SouthparkDeIE, @@ -237,10 +260,10 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE from .spike import SpikeIE from .stanfordoc import StanfordOpenClassroomIE -from .statigram import StatigramIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE @@ -251,6 +274,7 @@ from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .tinypic import TinyPicIE +from .tlc import TlcIE, TlcDeIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE @@ -280,6 +304,7 @@ from .videodetective import VideoDetectiveIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE +from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vimeo import ( VimeoIE, @@ 
-288,15 +313,21 @@ from .vimeo import ( VimeoAlbumIE, VimeoGroupsIE, VimeoReviewIE, + VimeoWatchLaterIE, +) +from .vine import ( + VineIE, + VineUserIE, ) -from .vine import VineIE from .viki import VikiIE from .vk import VKIE from .vube import VubeIE +from .vuclip import VuClipIE from .washingtonpost import WashingtonPostIE from .wat import WatIE from .wdr import ( WDRIE, + WDRMobileIE, WDRMausIE, ) from .weibo import WeiboIE diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index 6a8cd14c9..cfc7370ae 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -1,7 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -16,6 +15,7 @@ class AftonbladetIE(InfoExtractor): 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', + 'timestamp': 1394142732, 'upload_date': '20140306', }, } @@ -27,17 +27,17 @@ class AftonbladetIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find internal video meta data - META_URL = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' + meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' internal_meta_id = self._html_search_regex( r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') - internal_meta_url = META_URL % internal_meta_id + internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') # find internal video formats - FORMATS_URL = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' + format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' internal_video_id = internal_meta_json['videoId'] - internal_formats_url = FORMATS_URL % internal_video_id + internal_formats_url = format_url % internal_video_id internal_formats_json 
= self._download_json( internal_formats_url, video_id, 'Downloading video formats') @@ -54,16 +54,13 @@ class AftonbladetIE(InfoExtractor): }) self._sort_formats(formats) - timestamp = datetime.datetime.fromtimestamp(internal_meta_json['timePublished']) - upload_date = timestamp.strftime('%Y%m%d') - return { 'id': video_id, 'title': internal_meta_json['title'], 'formats': formats, 'thumbnail': internal_meta_json['imageUrl'], 'description': internal_meta_json['shortPreamble'], - 'upload_date': upload_date, + 'timestamp': internal_meta_json['timePublished'], 'duration': internal_meta_json['duration'], 'view_count': internal_meta_json['views'], } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 646377e4b..b528a9ec5 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -74,7 +74,8 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + json_url = self._html_search_regex( + r'arte_vp_url="(.*?)"', webpage, 'json vp url') return self._extract_from_json_url(json_url, video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): @@ -120,14 +121,17 @@ class ArteTVPlus7IE(InfoExtractor): return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) else: def sort_key(f): + versionCode = f.get('versionCode') + if versionCode is None: + versionCode = '' return ( # Sort first by quality - int(f.get('height',-1)), - int(f.get('bitrate',-1)), + int(f.get('height', -1)), + int(f.get('bitrate', -1)), # The original version with subtitles has lower relevance - re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, + re.match(r'VO-ST(F|A)', versionCode) is None, # The version with sourds/mal subtitles has also lower relevance - re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, + re.match(r'VO?(F|A)-STM\1', versionCode) is None, 
# Prefer http downloads over m3u8 0 if f['url'].endswith('m3u8') else 1, ) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 886b0dfab..dcbbdef43 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -12,14 +12,14 @@ from ..utils import ( class BandcampIE(InfoExtractor): - _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' + _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'file': '1812978515.mp3', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { "title": "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", - "duration": 10, + "duration": 9.8485, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }] @@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) - # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if m_download is None: + if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) if m_trackinfo: json_code = m_trackinfo.group(1) - data = json.loads(json_code) - d = data[0] + data = json.loads(json_code)[0] - duration = int(round(d['duration'])) formats = [] - for format_id, format_url in d['file'].items(): - ext, _, abr_str = format_id.partition('-') - + for format_id, format_url in data['file'].items(): + ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, 'url': format_url, - 'ext': format_id.partition('-')[0], + 'ext': ext, 'vcodec': 'none', - 'acodec': format_id.partition('-')[0], - 'abr': int(format_id.partition('-')[2]), + 'acodec': ext, + 'abr': int(abr_str), }) self._sort_formats(formats) return { - 'id': compat_str(d['id']), - 'title': d['title'], + 'id': compat_str(data['id']), + 'title': data['title'], 'formats': 
formats, - 'duration': duration, + 'duration': float(data['duration']), } else: raise ExtractorError('No free songs found') @@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor): r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', webpage, re.MULTILINE | re.DOTALL).group('id') - download_webpage = self._download_webpage(download_link, video_id, - 'Downloading free downloads page') - # We get the dictionary of the track from some javascrip code - info = re.search(r'items: (.*?),$', - download_webpage, re.MULTILINE).group(1) + download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') + # We get the dictionary of the track from some javascript code + info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1) info = json.loads(info)[0] # We pick mp3-320 for now, until format selection can be easily implemented. mp3_info = info['downloads']['mp3-320'] @@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))' _TEST = { 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -123,13 +117,15 @@ class BandcampAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 }, - 'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' + 'skip': 'Bandcamp imposes download limits. 
See test_playlists:test_bandcamp_album for the playlist test' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('subdomain') title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = title or playlist_id + webpage = self._download_webpage(url, display_id) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: raise ExtractorError('The page doesn\'t contain any tracks') @@ -139,6 +135,8 @@ class BandcampAlbumIE(InfoExtractor): title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title') return { '_type': 'playlist', + 'id': playlist_id, + 'display_id': display_id, 'title': title, 'entries': entries, } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py new file mode 100644 index 000000000..45067b944 --- /dev/null +++ b/youtube_dl/extractor/bilibili.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_parse_qs, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class BiliBiliIE(InfoExtractor): + _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/' + + _TEST = { + 'url': 'http://www.bilibili.tv/video/av1074402/', + 'md5': '2c301e4dab317596e837c3e7633e7d86', + 'info_dict': { + 'id': '1074402', + 'ext': 'flv', + 'title': '【金坷垃】金泡沫', + 'duration': 308, + 'upload_date': '20140420', + 'thumbnail': 're:^https?://.+\.jpg', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + video_code = self._search_regex( + r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') + + title = self._html_search_meta( + 'media:title', video_code, 'title', fatal=True) + duration_str = self._html_search_meta( + 'duration', video_code, 'duration') + if duration_str is None: + duration = None + 
else: + duration_mobj = re.match( + r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$', + duration_str) + duration = ( + int_or_none(duration_mobj.group('hours'), default=0) * 3600 + + int(duration_mobj.group('minutes')) * 60 + + int(duration_mobj.group('seconds'))) + upload_date = unified_strdate(self._html_search_meta( + 'uploadDate', video_code, fatal=False)) + thumbnail = self._html_search_meta( + 'thumbnailUrl', video_code, 'thumbnail', fatal=False) + + player_params = compat_parse_qs(self._html_search_regex( + r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"', + webpage, 'player params')) + + if 'cid' in player_params: + cid = player_params['cid'][0] + + lq_doc = self._download_xml( + 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid, + video_id, + note='Downloading LQ video info' + ) + lq_durl = lq_doc.find('.//durl') + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, + 'filesize': int_or_none( + lq_durl.find('./size'), get_attr='text'), + }] + + hq_doc = self._download_xml( + 'http://interface.bilibili.cn/playurl?cid=%s' % cid, + video_id, + note='Downloading HQ video info', + fatal=False, + ) + if hq_doc is not False: + hq_durl = hq_doc.find('.//durl') + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': hq_durl.find('./url').text, + 'filesize': int_or_none( + hq_durl.find('./size'), get_attr='text'), + }) + else: + raise ExtractorError('Unsupported player parameters: %r' % (player_params,)) + + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 96408e4e0..38ccd957f 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import datetime import json 
import re @@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor): 'file': '8aQUy7GV.mp4', 'md5': '2e9a07364af40163a908edbf10bb2492', 'info_dict': { - "title": "Police Car Rolls Away", - "uploader": "stupidvideos.com", - "upload_date": "20131215", - "description": "A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!", - "duration": 14.886, - "thumbnails": [{ - "width": 100, - "height": 76, - "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", + 'title': 'Police Car Rolls Away', + 'uploader': 'stupidvideos.com', + 'upload_date': '20131215', + 'timestamp': 1387068000, + 'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!', + 'duration': 14.886, + 'thumbnails': [{ + 'width': 100, + 'height': 76, + 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg', }], }, } @@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor): 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] - dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) - pload_date = dt.strftime('%Y%m%d') - duration = None thumbnails = [] formats = [] @@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') tbr = (int(m['vbr']) + int(m['abr'])) // 1000 - format_id = (u'%s-%sk-%s' % - (vcodec, - tbr, - m['w'])) + format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w']) formats.append({ 'format_id': format_id, 'url': m['link'], @@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor): 'title': data['title'], 'formats': formats, 'uploader': data['channel_name'], - 'upload_date': pload_date, + 'timestamp': data['pubdate_epoch'], 'description': data.get('description'), 'thumbnails': thumbnails, 'duration': duration, diff --git 
a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index a26001bb3..d4da08991 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -1,102 +1,124 @@ from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( - compat_str, compat_urllib_request, - unescapeHTML, + parse_iso8601, + compat_urlparse, + clean_html, + compat_str, ) class BlipTVIE(SubtitlesInfoExtractor): - """Information extractor for blip.tv""" + _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z]+)))' - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(?P<presumptive_id>.+)$' - - _TESTS = [{ - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': 'c6934ad0b6acf2bd920720ec888eb812', - 'info_dict': { - 'id': '5779306', - 'ext': 'mov', - 'upload_date': '20111205', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'uploader': 'Comic Book Resources - CBR TV', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', + _TESTS = [ + { + 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', + 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'info_dict': { + 'id': '5779306', + 'ext': 'mov', + 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', + 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', + 'timestamp': 1323138843, + 'upload_date': '20111206', + 'uploader': 'cbr', + 'uploader_id': '679425', + 'duration': 81, + } + }, + { + # https://github.com/rg3/youtube-dl/pull/2274 + 'note': 'Video with subtitles', + 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', + 'md5': '309f9d25b820b086ca163ffac8031806', + 'info_dict': { + 'id': '6586561', + 'ext': 'mp4', + 'title': 'Red vs. 
Blue Season 11 Episode 1', + 'description': 'One-Zero-One', + 'timestamp': 1371261608, + 'upload_date': '20130615', + 'uploader': 'redvsblue', + 'uploader_id': '792887', + 'duration': 279, + } } - }, { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'uploader': 'Red vs. Blue', - 'description': 'One-Zero-One', - 'upload_date': '20130614', - 'title': 'Red vs. Blue Season 11 Episode 1', - } - }] + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - presumptive_id = mobj.group('presumptive_id') + lookup_id = mobj.group('lookup_id') # See https://github.com/rg3/youtube-dl/issues/857 - embed_mobj = re.match(r'https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url) - if embed_mobj: - info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1) - info_page = self._download_webpage(info_url, embed_mobj.group(1)) - video_id = self._search_regex( - r'data-episode-id="([0-9]+)', info_page, 'video_id') - return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV') - - cchar = '&' if '?' in url else '?' 
- json_url = url + cchar + 'skin=json&version=2&no_wrap=1' - request = compat_urllib_request.Request(json_url) - request.add_header('User-Agent', 'iTunes/10.6.1') - - json_data = self._download_json(request, video_id=presumptive_id) - - if 'Post' in json_data: - data = json_data['Post'] + if lookup_id: + info_page = self._download_webpage( + 'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id') + video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id') else: - data = json_data + video_id = mobj.group('id') + + rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') + + def blip(s): + return '{http://blip.tv/dtd/blip/1.0}%s' % s + + def media(s): + return '{http://search.yahoo.com/mrss/}%s' % s + + def itunes(s): + return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s + + item = rss.find('channel/item') + + video_id = item.find(blip('item_id')).text + title = item.find('./title').text + description = clean_html(compat_str(item.find(blip('puredescription')).text)) + timestamp = parse_iso8601(item.find(blip('datestamp')).text) + uploader = item.find(blip('user')).text + uploader_id = item.find(blip('userid')).text + duration = int(item.find(blip('runtime')).text) + media_thumbnail = item.find(media('thumbnail')) + thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text + categories = [category.text for category in item.findall('category')] - video_id = compat_str(data['item_id']) - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - subtitles = {} formats = [] - if 'additionalMedia' in data: - for f in data['additionalMedia']: - if f.get('file_type_srt') == 1: - LANGS = { - 'english': 'en', - } - lang = f['role'].rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles[langcode] = f['url'] - continue - if not int(f['media_width']): # filter 
m3u8 - continue + subtitles = {} + + media_group = item.find(media('group')) + for media_content in media_group.findall(media('content')): + url = media_content.get('url') + role = media_content.get(blip('role')) + msg = self._download_webpage( + url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', + video_id, 'Resolving URL for %s' % role) + real_url = compat_urlparse.parse_qs(msg)['message'][0] + + media_type = media_content.get('type') + if media_type == 'text/srt' or url.endswith('.srt'): + LANGS = { + 'english': 'en', + } + lang = role.rpartition('-')[-1].strip().lower() + langcode = LANGS.get(lang, lang) + subtitles[langcode] = url + elif media_type.startswith('video/'): formats.append({ - 'url': f['url'], - 'format_id': f['role'], - 'width': int(f['media_width']), - 'height': int(f['media_height']), + 'url': real_url, + 'format_id': role, + 'format_note': media_type, + 'vcodec': media_content.get(blip('vcodec')), + 'acodec': media_content.get(blip('acodec')), + 'filesize': media_content.get('filesize'), + 'width': int(media_content.get('width')), + 'height': int(media_content.get('height')), }) - else: - formats.append({ - 'url': data['media']['url'], - 'width': int(data['media']['width']), - 'height': int(data['media']['height']), - }) self._sort_formats(formats) # subtitles @@ -107,12 +129,14 @@ class BlipTVIE(SubtitlesInfoExtractor): return { 'id': video_id, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'user_agent': 'iTunes/10.6.1', + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'categories': categories, 'formats': formats, 'subtitles': video_subtitles, } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 49dfd881e..0202078b0 
100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,53 +1,72 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + unified_strdate, + url_basename, +) class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))' + _VALID_URL = r'https?://(?:www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' - IE_NAME = u'canalplus.fr' + IE_NAME = 'canalplus.fr' _TEST = { - u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - u'file': u'922470.flv', - u'info_dict': { - u'title': u'Zapping - 26/08/13', - u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', - u'upload_date': u'20130826', - }, - u'params': { - u'skip_download': True, + 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + 'md5': '3db39fb48b9685438ecf33a1078023e4', + 'info_dict': { + 'id': '922470', + 'ext': 'flv', + 'title': 'Zapping - 26/08/13', + 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', + 'upload_date': '20130826', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.groupdict().get('id') + + # Beware, some subclasses do not define an id group + display_id = url_basename(mobj.group('path')) + if video_id is None: - webpage = self._download_webpage(url, mobj.group('path')) - video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, u'video id') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id - doc = 
self._download_xml(info_url,video_id, - u'Downloading video info') + doc = self._download_xml(info_url, video_id, 'Downloading video XML') - self.report_extraction(video_id) video_info = [video for video in doc if video.find('ID').text == video_id][0] - infos = video_info.find('INFOS') media = video_info.find('MEDIA') - formats = [media.find('VIDEOS/%s' % format) - for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] - video_url = [format.text for format in formats if format is not None][-1] + infos = video_info.find('INFOS') - return {'id': video_id, - 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, - infos.find('TITRAGE/SOUS_TITRE').text), - 'url': video_url, - 'ext': 'flv', - 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), - 'thumbnail': media.find('IMAGES/GRAND').text, - 'description': infos.find('DESCRIPTION').text, - 'view_count': int(infos.find('NB_VUES').text), - } + preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'] + + formats = [ + { + 'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text, + 'format_id': fmt.tag, + 'ext': 'mp4' if fmt.tag == 'HLS' else 'flv', + 'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1, + } for fmt in media.find('VIDEOS') if fmt.text + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), + 'like_count': int(infos.find('NB_LIKES').text), + 'comment_count': int(infos.find('NB_COMMENTS').text), + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2301f61b6..496271be4 100644 --- 
a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -1,10 +1,12 @@ # encoding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, ) @@ -13,9 +15,10 @@ class CinemassacreIE(InfoExtractor): _TESTS = [ { 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - 'file': '19911.mp4', - 'md5': '782f8504ca95a0eba8fc9177c373eec7', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', 'info_dict': { + 'id': '19911', + 'ext': 'mp4', 'upload_date': '20121110', 'title': '“Angry Video Game Nerd: The Movie” – Trailer', 'description': 'md5:fb87405fcb42a331742a0dce2708560b', @@ -23,9 +26,10 @@ class CinemassacreIE(InfoExtractor): }, { 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - 'file': '521be8ef82b16.mp4', - 'md5': 'dec39ee5118f8d9cc067f45f9cbe3a35', + 'md5': 'd72f10cd39eac4215048f62ab477a511', 'info_dict': { + 'id': '521be8ef82b16', + 'ext': 'mp4', 'upload_date': '20131002', 'title': 'The Mummy’s Hand (1940)', }, @@ -50,29 +54,40 @@ class CinemassacreIE(InfoExtractor): r'<div class="entry-content">(?P<description>.+?)</div>', webpage, 'description', flags=re.DOTALL, fatal=False) - playerdata = self._download_webpage(playerdata_url, video_id) + playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') + video_thumbnail = self._search_regex( + r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) + sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') + videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') - sd_url = self._html_search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') - hd_url = self._html_search_regex( - r'file: \'([^\']+)\', label: \'HD\'', playerdata, 'hd_file', - default=None) - video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', 
playerdata, 'thumbnail', fatal=False) + videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') - formats = [{ - 'url': sd_url, - 'ext': 'mp4', - 'format': 'sd', - 'format_id': 'sd', - 'quality': 1, - }] - if hd_url: - formats.append({ - 'url': hd_url, - 'ext': 'mp4', - 'format': 'hd', - 'format_id': 'hd', - 'quality': 2, - }) + formats = [] + baseurl = sd_url[:sd_url.rfind('/')+1] + for video in videolist.findall('.//video'): + src = video.get('src') + if not src: + continue + file_ = src.partition(':')[-1] + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + bitrate = int_or_none(video.get('system-bitrate')) + format = { + 'url': baseurl + file_, + 'format_id': src.rpartition('.')[0].rpartition('_')[-1], + } + if width or height: + format.update({ + 'tbr': bitrate // 1000 if bitrate else None, + 'width': width, + 'height': height, + }) + else: + format.update({ + 'abr': bitrate // 1000 if bitrate else None, + 'vcodec': 'none', + }) + formats.append(format) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py new file mode 100644 index 000000000..14f215c5c --- /dev/null +++ b/youtube_dl/extractor/clubic.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + qualities, +) + + +class ClubicIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html' + + _TEST = { + 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', + 'md5': '1592b694ba586036efac1776b0b43cd3', + 'info_dict': { + 'id': '448474', + 'ext': 'mp4', + 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', + 'description': 're:Gueule de bois chez Nokia. 
Le constructeur a indiqué cette.*', + 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_page = self._download_webpage(player_url, video_id) + + config_json = self._search_regex( + r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, + 'configuration') + config = json.loads(config_json) + + video_info = config['videoInfo'] + sources = config['sources'] + quality_order = qualities(['sd', 'hq']) + + formats = [{ + 'format_id': src['streamQuality'], + 'url': src['src'], + 'quality': quality_order(src['streamQuality']), + } for src in sources] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'description': clean_html(video_info.get('description')), + 'thumbnail': config.get('poster'), + } diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index f5ab443d2..a94f42571 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -33,7 +33,7 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\" data-cnet-video-options='([^']+)'", + r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", webpage, 'data json') data = json.loads(data_json) vdata = data['video'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0b90febf4..1d49a2b8c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -117,6 +117,8 @@ class InfoExtractor(object): webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. 
(It will be set by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] Unless mentioned otherwise, the fields should be Unicode strings. @@ -246,10 +248,11 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - if len(url) > 200: - h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() - url = url[:200 - len(h)] + h - raw_filename = ('%s_%s.dump' % (video_id, url)) + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen(u'Saving request to ' + filename) with open(filename, 'wb') as outf: @@ -283,9 +286,12 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to download XML', - transform_source=None): + transform_source=None, fatal=True): """Return the xml as an xml.etree.ElementTree.Element""" - xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + xml_string = self._download_webpage( + url_or_request, video_id, note, errnote, fatal=fatal) + if xml_string is False: + return xml_string if transform_source: xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) @@ -549,6 +555,23 @@ class InfoExtractor(object): ) formats.sort(key=_formats_key) + def http_scheme(self): + """ Either "http:" or "https:", depending on the user's preferences """ + return ( + 'http:' + if self._downloader.params.get('prefer_insecure', False) + else 'https:') + + def _proto_relative_url(self, url, scheme=None): + if url is None: + return url + if url.startswith('//'): + if scheme is None: + scheme = self.http_scheme() + return scheme + url + else: + return url + def _entry_formats_to_parts(self, entries):
'''Transforms entries with formats to formats with parts. Used when joinparts is set.''' ekeys = None @@ -615,3 +638,4 @@ class SearchInfoExtractor(InfoExtractor): @property def SEARCH_KEY(self): return self._SEARCH_KEY + diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 91c1c1348..ffbe4903b 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -28,16 +28,18 @@ class CondeNastIE(InfoExtractor): 'glamour': 'Glamour', 'wmagazine': 'W Magazine', 'vanityfair': 'Vanity Fair', + 'cnevids': 'Condé Nast', } - _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) _TEST = { 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', - 'file': '5171b343c2b4c00dd0c1ccb3.mp4', 'md5': '1921f713ed48aabd715691f774c451f7', 'info_dict': { + 'id': '5171b343c2b4c00dd0c1ccb3', + 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. 
You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', } @@ -55,12 +57,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage): - description = self._html_search_regex([r'<div class="cne-video-description">(.+?)</div>', - r'<div class="video-post-content">(.+?)</div>', - ], - webpage, 'description', - fatal=False, flags=re.DOTALL) + def _extract_video(self, webpage, url_type): + if url_type != 'embed': + description = self._html_search_regex( + [ + r'<div class="cne-video-description">(.+?)</div>', + r'<div class="video-post-content">(.+?)</div>', + ], + webpage, 'description', fatal=False, flags=re.DOTALL) + else: + description = None params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, 'player params', flags=re.DOTALL) video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') @@ -99,12 +105,12 @@ class CondeNastIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site = mobj.group('site') url_type = mobj.group('type') - id = mobj.group('id') + item_id = mobj.group('id') - self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) - webpage = self._download_webpage(url, id) + self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) + webpage = self._download_webpage(url, item_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage) + return self._extract_video(webpage, url_type) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index bae1c7754..55216201f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,12 +8,11 @@ from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_urllib_request, compat_str, - 
get_element_by_id, orderedSet, str_to_int, int_or_none, - ExtractorError, + unescapeHTML, ) class DailymotionBaseInfoExtractor(InfoExtractor): @@ -189,7 +188,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): webpage = self._download_webpage(request, id, u'Downloading page %s' % pagenum) - video_ids.extend(re.findall(r'data-id="(.+?)"', webpage)) + video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break @@ -218,9 +217,9 @@ class DailymotionUserIE(DailymotionPlaylistIE): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') webpage = self._download_webpage(url, user) - full_user = self._html_search_regex( - r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user), - webpage, u'user', flags=re.DOTALL) + full_user = unescapeHTML(self._html_search_regex( + r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user), + webpage, u'user', flags=re.DOTALL)) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py new file mode 100644 index 000000000..e6952588f --- /dev/null +++ b/youtube_dl/extractor/empflix.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class EmpflixIE(InfoExtractor): + _VALID_URL = r'^https?://www\.empflix\.com/videos/.*?-(?P<id>[0-9]+)\.html' + _TEST = { + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + age_limit = self._rta_search(webpage) + + video_title = self._html_search_regex( + r'name="title" value="(?P<title>[^"]*)"', webpage, 'title') + 
video_description = self._html_search_regex( + r'name="description" value="([^"]*)"', webpage, 'description', fatal=False) + + cfg_url = self._html_search_regex( + r'flashvars\.config = escape\("([^"]+)"', + webpage, 'flashvars.config') + + cfg_xml = self._download_xml( + cfg_url, video_id, note='Downloading metadata') + + formats = [ + { + 'url': item.find('videoLink').text, + 'format_id': item.find('res').text, + } for item in cfg_xml.findall('./quality/item') + ] + + return { + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'formats': formats, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 1c20e4364..ff7c0cd3e 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,4 +1,5 @@ -import os +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -8,18 +9,23 @@ from ..utils import ( compat_urllib_parse, ) + class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' - _TEST = { - u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - u'file': u'652431.mp4', - u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', - u'info_dict': { - u"title": u"Music Video 14 british euro brit european cumshots swallow", - u"uploader": u"unknown", - u"age_limit": 18, + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'info_dict': { + 'id': '652431', + 'ext': 'mp4', + 'title': 'Music Video 14 british euro brit european cumshots swallow', + 'uploader': 'unknown', + 'age_limit': 18, } - } + }, { + 'url': 
'http://www.extremetube.com/gay/video/abcde-1234', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -30,11 +36,14 @@ class ExtremeTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') - uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) - video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + video_title = self._html_search_regex( + r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, 'title') + uploader = self._html_search_regex( + r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', + fatal=False) + video_url = compat_urllib_parse.unquote(self._html_search_regex( + r'video_url=(.+?)&', webpage, 'video_url')) path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] format = "-".join(format) @@ -43,7 +52,6 @@ class ExtremeTubeIE(InfoExtractor): 'title': video_title, 'uploader': uploader, 'url': video_url, - 'ext': extension, 'format': format, 'format_id': format, 'age_limit': 18, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a713628b2..f0cd8f156 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -76,9 +76,8 @@ class FacebookIE(InfoExtractor): check_form = { 'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'), - 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'), + 'h': self._search_regex(r'name="h" value="(\w*?)"', login_results, 'h'), 'name_action_selected': 'dont_save', - 'submit[Continue]': self._search_regex(r'<button[^>]+value="(.*?)"[^>]+name="submit\[Continue\]"', login_results, 'continue'), } check_req = 
compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py new file mode 100644 index 000000000..ca8993241 --- /dev/null +++ b/youtube_dl/extractor/fc2.py @@ -0,0 +1,60 @@ +#! -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import hashlib + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_request, + compat_urlparse, +) + + +class FC2IE(InfoExtractor): + _VALID_URL = r'^http://video\.fc2\.com/(?P<lang>[^/]+)/content/(?P<id>[^/]+)' + IE_NAME = 'fc2' + _TEST = { + 'url': 'http://video.fc2.com/en/content/20121103kUan1KHs', + 'md5': 'a6ebe8ebe0396518689d963774a54eb7', + 'info_dict': { + 'id': '20121103kUan1KHs', + 'ext': 'flv', + 'title': 'Boxing again with Puff', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + refer = url.replace('/content/', '/a/content/') + + mimi = hashlib.md5(video_id + '_gGddgPfeaf_gzyr').hexdigest() + + info_url = ( + "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&". 
+ format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.','%2E'))) + + info_webpage = self._download_webpage( + info_url, video_id, note='Downloading info page') + info = compat_urlparse.parse_qs(info_webpage) + + if 'err_code' in info: + raise ExtractorError('Error code: %s' % info['err_code'][0]) + + video_url = info['filepath'][0] + '?mid=' + info['mid'][0] + + return { + 'id': video_id, + 'title': info['title'][0], + 'url': video_url, + 'ext': 'flv', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index b596bf587..3a50bab5c 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( compat_str, compat_urllib_parse, + ExtractorError, ) @@ -58,9 +59,17 @@ class FiveMinIE(InfoExtractor): 'isPlayerSeed': 'true', 'url': embed_url, }) - info = self._download_json( + response = self._download_json( 'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ query, - video_id)['binding'][0] + video_id) + if not response['success']: + err_msg = response['errorMessage'] + if err_msg == 'ErrorVideoUserNotGeo': + msg = 'Video not available from your location' + else: + msg = 'Aol said: %s' % err_msg + raise ExtractorError(msg, expected=True, video_id=video_id) + info = response['binding'][0] second_id = compat_str(int(video_id[:-2]) + 1) formats = [] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 51eb97b2f..f3e0f38b7 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -48,24 +48,36 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html' + _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P<title>.+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', - 'file': '84981923.mp4', 'info_dict': { + 'id': '84981923', + 'ext': 'mp4', 'title': 'Soir 3', }, 'params': { 'skip_download': True, }, - } + }, { + 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', + 'info_dict': { + 'id': 'EV_20019', + 'ext': 'mp4', + 'title': 'Débat des candidats à la Commission européenne', + 'description': 'Débat des candidats à la Commission européenne', + }, + 'params': { + 'skip_download': 'HLS (reqires ffmpeg)' + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) - video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id') + video_id = self._search_regex(r'id-video=((?:[^0-9]*?_)?[0-9]+)[@"]', webpage, 'video id') return self._extract_video(video_id) diff --git a/youtube_dl/extractor/funnyordie.py 
b/youtube_dl/extractor/funnyordie.py index 56e079288..6e6b66660 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -4,22 +4,32 @@ import json import re from .common import InfoExtractor +from ..utils import ExtractorError class FunnyOrDieIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])' - _TEST = { + _TESTS = [{ 'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version', - 'file': '0732f586d7.mp4', - 'md5': 'f647e9e90064b53b6e046e75d0241fbd', + 'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9', 'info_dict': { - 'description': ('Lyrics changed to match the video. Spoken cameo ' - 'by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a ' - 'concept by Dustin McLean (DustFilms.com). Performed, edited, ' - 'and written by David A. Scott.'), + 'id': '0732f586d7', + 'ext': 'mp4', 'title': 'Heart-Shaped Box: Literal Video Version', + 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', + 'thumbnail': 're:^http:.*\.jpg$', }, - } + }, { + 'url': 'http://www.funnyordie.com/embed/e402820827', + 'md5': 'ff4d83318f89776ed0250634cfaa8d36', + 'info_dict': { + 'id': 'e402820827', + 'ext': 'mp4', + 'title': 'Please Use This Song (Jon Lajoie)', + 'description': 'md5:2ed27d364f5a805a6dba199faaf6681d', + 'thumbnail': 're:^http:.*\.jpg$', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -27,27 +37,34 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], - webpage, 'video URL', flags=re.DOTALL) + links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage) + if not links: + raise ExtractorError('No media links available for %s' % video_id) - if mobj.group('type') == 'embed': - post_json = self._search_regex( - 
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') - post = json.loads(post_json) - title = post['name'] - description = post.get('description') - thumbnail = post.get('picture') - else: - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = None + links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) + + bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates') + bitrates = [int(b) for b in bitrates.rstrip(',').split(',')] + bitrates.sort() + + formats = [] + + for bitrate in bitrates: + for link in links: + formats.append({ + 'url': '%s%d.%s' % (link[0], bitrate, link[1]), + 'format_id': '%s-%d' % (link[1], bitrate), + 'vbr': bitrate, + }) + + post_json = self._search_regex( + r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') + post = json.loads(post_json) return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': post['name'], + 'description': post.get('description'), + 'thumbnail': post.get('picture'), + 'formats': formats, } diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 233398966..11fee3d31 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -15,7 +15,7 @@ class GamekingsIE(InfoExtractor): 'id': '20130811', 'ext': 'mp4', 'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review', - 'description': 'md5:632e61a9f97d700e83f43d77ddafb6a4', + 'description': 'md5:36fd701e57e8c15ac8682a2374c99731', } } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index c9598ad3a..3d67b9d60 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -15,11 +15,12 @@ from ..utils import ( class GameSpotIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' 
_TEST = { - "url": "http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", - "file": "gs-2300-6410818.mp4", - "md5": "b2a30deaa8654fcccd43713a6b6a4825", - "info_dict": { - "title": "Arma 3 - Community Guide: SITREP I", + 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', + 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', + 'info_dict': { + 'id': 'gs-2300-6410818', + 'ext': 'mp4', + 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', } } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f9b9d56d2..38a357d3b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -239,6 +239,28 @@ class GenericIE(InfoExtractor): 'uploader_id': 'rbctv_2012_4', }, }, + # Condé Nast embed + { + 'url': 'http://www.wired.com/2014/04/honda-asimo/', + 'md5': 'ba0dfe966fa007657bd1443ee672db0f', + 'info_dict': { + 'id': '53501be369702d3275860000', + 'ext': 'mp4', + 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', + } + }, + # Dailymotion embed + { + 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', + 'md5': '441aeeb82eb72c422c7f14ec533999cd', + 'info_dict': { + 'id': 'k2mm4bCdJ6CQ2i7c8o2', + 'ext': 'mp4', + 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'uploader': 'Spi0n', + }, + 'add_ie': ['Dailymotion'], + } ] def report_download_webpage(self, video_id): @@ -323,6 +345,12 @@ class GenericIE(InfoExtractor): } def _real_extract(self, url): + if url.startswith('//'): + return { + '_type': 'url', + 'url': self.http_scheme() + url, + } + parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: default_search = self._downloader.params.get('default_search') @@ -335,8 +363,13 @@ class GenericIE(InfoExtractor): return self.url_result('http://' + url) else: if default_search == 'auto_warning': - self._downloader.report_warning( - 'Falling back to youtube search for 
%s . Set --default-search to "auto" to suppress this warning.' % url) + if re.match(r'^(?:url|URL)$', url): + raise ExtractorError( + 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, + expected=True) + else: + self._downloader.report_warning( + 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url) return self.url_result('ytsearch:' + url) else: assert ':' in default_search @@ -459,7 +492,7 @@ class GenericIE(InfoExtractor): matches = re.findall( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) if matches: - urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') + urlrs = [self.url_result(unescapeHTML(tuppl[1])) for tuppl in matches] return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -485,6 +518,22 @@ class GenericIE(InfoExtractor): if mobj: return self.url_result(mobj.group(1), 'BlipTV') + # Look for embedded condenast player + matches = re.findall( + r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', + webpage) + if matches: + return { + '_type': 'playlist', + 'entries': [{ + '_type': 'url', + 'ie_key': 'CondeNast', + 'url': ma, + } for ma in matches], + 'title': video_title, + 'id': video_id, + } + # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -505,7 +554,7 @@ class GenericIE(InfoExtractor): return OoyalaIE._build_url_result(mobj.group('ec')) # Look for Aparat videos - mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage) + mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) if mobj is not None: return self.url_result(mobj.group(1), 'Aparat') @@ -516,7 +565,7 @@ class GenericIE(InfoExtractor): # Look for embedded NovaMov-based player 
mobj = re.search( - r'''(?x)<iframe[^>]+?src=(["\']) + r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) (?P<url>http://(?:(?:embed|www)\.)? (?:novamov\.com| nowvideo\.(?:ch|sx|eu|at|ag|co)| @@ -589,65 +638,86 @@ class GenericIE(InfoExtractor): if smotri_url: return self.url_result(smotri_url, 'Smotri') - # Start with something easy: JW Player in SWFObject - mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) - if mobj is None: - # Look for gorilla-vid style embedding - mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage) - if mobj is None: - # Broaden the search a little bit - mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + # Look for embeded soundcloud player + mobj = re.search( + r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', + webpage) + if mobj is not None: + url = unescapeHTML(mobj.group('url')) + return self.url_result(url) - if mobj is None: + # Start with something easy: JW Player in SWFObject + found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) + if not found: + # Look for gorilla-vid style embedding + found = re.findall(r'''(?sx) + (?: + jw_plugins| + JWPlayerOptions| + jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup + ) + .*?file\s*:\s*["\'](.*?)["\']''', webpage) + if not found: + # Broaden the search a little bit + found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + if not found: + # Broaden the findall a little bit: JWPlayer JS loader + found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + if not found: # Try to find twitter cards info - mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) - if mobj is None: 
+ found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) + if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) - m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) + m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: - mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) - if mobj is None: + found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) + if not found: # HTML5 video - mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) - if mobj is None: - mobj = re.search( + found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage) + if not found: + found = re.search( r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"', webpage) - if mobj: - new_url = mobj.group(1) + if found: + new_url = found.group(1) self.report_following_redirect(new_url) return { '_type': 'url', 'url': new_url, } - if mobj is None: + if not found: raise ExtractorError('Unsupported URL: %s' % url) - # It's possible that one of the regexes - # matched, but returned an empty group: - if mobj.group(1) is None: - raise ExtractorError('Did not find a valid video URL at %s' % url) + entries = [] + for video_url in found: + video_url = compat_urlparse.urljoin(url, video_url) + video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) - video_url = mobj.group(1) - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) + # Sometimes, jwplayer extraction will result in a YouTube URL + if 
YoutubeIE.suitable(video_url): + entries.append(self.url_result(video_url, 'Youtube')) + continue - # Sometimes, jwplayer extraction will result in a YouTube URL - if YoutubeIE.suitable(video_url): - return self.url_result(video_url, 'Youtube') + # here's a fun little line of code for you: + video_id = os.path.splitext(video_id)[0] - # here's a fun little line of code for you: - video_id = os.path.splitext(video_id)[0] + entries.append({ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'title': video_title, + }) + + if len(entries) == 1: + return entries[0] + else: + for num, e in enumerate(entries, start=1): + e['title'] = '%s (%d)' % (e['title'], num) + return { + '_type': 'playlist', + 'entries': entries, + } - return { - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'title': video_title, - } diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py new file mode 100644 index 000000000..63d87b74c --- /dev/null +++ b/youtube_dl/extractor/hentaistigma.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class HentaiStigmaIE(InfoExtractor): + _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/', + 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23', + 'info_dict': { + 'id': 'inyouchuu-etsu-bonus', + 'ext': 'mp4', + "title": "Inyouchuu Etsu Bonus", + "age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>', + webpage, 'title') + wrap_url = self._html_search_regex( + r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url') + wrap_webpage = self._download_webpage(wrap_url, video_id) + + video_url = self._html_search_regex( + r'clip:\s*{\s*url: "([^"]*)"', 
wrap_webpage, 'video url') + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/iconosquare.py similarity index 76% rename from youtube_dl/extractor/statigram.py rename to youtube_dl/extractor/iconosquare.py index d602e817a..1d5a10a3b 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/iconosquare.py @@ -5,8 +5,8 @@ import re from .common import InfoExtractor -class StatigramIE(InfoExtractor): - _VALID_URL = r'https?://(www\.)?statigr\.am/p/(?P<id>[^/]+)' +class IconosquareIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)' _TEST = { 'url': 'http://statigr.am/p/522207370455279102_24101272', 'md5': '6eb93b882a3ded7c378ee1d6884b1814', @@ -15,6 +15,7 @@ class StatigramIE(InfoExtractor): 'ext': 'mp4', 'uploader_id': 'aguynamedpatrick', 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)', + 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d', }, } @@ -25,7 +26,7 @@ class StatigramIE(InfoExtractor): html_title = self._html_search_regex( r'<title>(.+?)', webpage, 'title') - title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title) + title = re.sub(r'(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)$', '', html_title) uploader_id = self._html_search_regex( r'@([^ ]+)', title, 'uploader name', fatal=False) @@ -33,6 +34,7 @@ class StatigramIE(InfoExtractor): 'id': video_id, 'url': self._og_search_video_url(webpage), 'title': title, + 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id': uploader_id } diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index cfeaa4146..1f42c6d3a 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -106,7 +106,7 @@ class OneUPIE(IGNIE): _DESCRIPTION_RE = r'
(.+?)
' - _TEST = { + _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976', 'md5': '68a54ce4ebc772e4b71e3123d413163d', 'info_dict': { @@ -115,10 +115,7 @@ class OneUPIE(IGNIE): 'title': 'Sniper Elite V2 - Trailer', 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', } - } - - # Override IGN tests - _TESTS = [] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index ed32373a1..e76dd222d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -11,16 +11,15 @@ from ..utils import ( class InfoQIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P[^/]+)$' + _TEST = { - "name": "InfoQ", - "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", - "file": "12-jan-pythonthings.mp4", - "info_dict": { - "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", - "title": "A Few of My Favorite [Python] Things", - }, - "params": { - "skip_download": True, + 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', + 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', + 'info_dict': { + 'id': '12-jan-pythonthings', + 'ext': 'mp4', + 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', + 'title': 'A Few of My Favorite [Python] Things', }, } @@ -30,26 +29,39 @@ class InfoQIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + video_title = self._html_search_regex(r'(.*?)', webpage, 'title') + video_description = self._html_search_meta('description', webpage, 'description') + + # The server URL is hardcoded + video_url = 'rtmpe://video.infoq.com/cfx/st/' + # Extract video URL - encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') + encoded_id = 
self._search_regex( + r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id') real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) - video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id + playpath = 'mp4:' + real_id - # Extract title - video_title = self._search_regex(r'contentTitle = "(.*?)";', - webpage, 'title') - - # Extract description - video_description = self._html_search_regex(r'', - webpage, 'description', fatal=False) - - video_filename = video_url.split('/')[-1] + video_filename = playpath.split('/')[-1] video_id, extension = video_filename.split('.') + http_base = self._search_regex( + r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage, + 'HTTP base URL') + + formats = [{ + 'format_id': 'rtmp', + 'url': video_url, + 'ext': extension, + 'play_path': playpath, + }, { + 'format_id': 'http', + 'url': http_base + real_id, + }] + self._sort_formats(formats) + return { 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': extension, # Extension is always(?) 
mp4, but seems to be flv 'description': video_description, + 'formats': formats, } diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index f06dcb05e..9b553b9fa 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -14,7 +14,7 @@ class JukeboxIE(InfoExtractor): _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P[a-z0-9\-]+)\.html' _TEST = { 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'md5': '5dc6477e74b1e37042ac5acedd8413e5', + 'md5': '1574e9b4d6438446d5b7dbcdf2786276', 'info_dict': { 'id': 'r303r', 'ext': 'flv', diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index f819c09b3..5016989cc 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import datetime from .common import InfoExtractor @@ -19,6 +18,7 @@ class MailRuIE(InfoExtractor): 'id': '46301138', 'ext': 'mp4', 'title': 'Новый Человек-Паук. Высокое напряжение. 
Восстание Электро', + 'timestamp': 1393232740, 'upload_date': '20140224', 'uploader': 'sonypicturesrus', 'uploader_id': 'sonypicturesrus@mail.ru', @@ -43,7 +43,6 @@ class MailRuIE(InfoExtractor): thumbnail = movie['poster'] duration = movie['duration'] - upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d') view_count = video_data['views_count'] formats = [ @@ -57,7 +56,7 @@ class MailRuIE(InfoExtractor): 'id': content_id, 'title': title, 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'timestamp': video_data['timestamp'], 'uploader': uploader, 'uploader_id': uploader_id, 'duration': duration, diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 7aa0080d7..1b8c4a32e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,15 +1,18 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?Pvideo|audio)(?P[^/_]+)_.*' + _VALID_URL = r'^(?Phttps?://(?:www\.)?mdr\.de)/(?:.*)/(?Pvideo|audio)(?P[^/_]+)(?:_|\.html)' # No tests, MDR regularily deletes its videos + _TEST = { + 'url': 'http://www.mdr.de/fakt/video189002.html', + 'only_matching': True, + } def _real_extract(self, url): m = re.match(self._VALID_URL, url) @@ -19,9 +22,9 @@ class MDRIE(InfoExtractor): # determine title and media streams from webpage html = self._download_webpage(url, video_id) - title = self._html_search_regex(r'

(.*?)

', html, u'title') + title = self._html_search_regex(r'(.*?)', html, 'title') xmlurl = self._search_regex( - r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL') + r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') doc = self._download_xml(domain + xmlurl, video_id) formats = [] @@ -41,7 +44,7 @@ class MDRIE(InfoExtractor): if vbr_el is None: format.update({ 'vcodec': 'none', - 'format_id': u'%s-%d' % (media_type, abr), + 'format_id': '%s-%d' % (media_type, abr), }) else: vbr = int(vbr_el.text) // 1000 @@ -49,12 +52,9 @@ class MDRIE(InfoExtractor): 'vbr': vbr, 'width': int(a.find('frameWidth').text), 'height': int(a.find('frameHeight').text), - 'format_id': u'%s-%d' % (media_type, vbr), + 'format_id': '%s-%d' % (media_type, vbr), }) formats.append(format) - if not formats: - raise ExtractorError(u'Could not find any valid formats') - self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index c4bd53fe7..5f64e7bd0 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -4,9 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( - unified_strdate, compat_urllib_parse, ExtractorError, + int_or_none, + parse_iso8601, ) @@ -24,6 +25,10 @@ class MixcloudIE(InfoExtractor): 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', 'upload_date': '20111115', + 'timestamp': 1321359578, + 'thumbnail': 're:https?://.*\.jpg', + 'view_count': int, + 'like_count': int, }, } @@ -51,10 +56,6 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) - api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) - info = self._download_json( - api_url, track_id, 'Downloading cloudcast info') - preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url') song_url = preview_url.replace('/previews/', '/c/originals/') @@ -65,16 +66,41 @@ class 
MixcloudIE(InfoExtractor): template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') final_song_url = self._get_url(template_url) if final_song_url is None: - raise ExtractorError(u'Unable to extract track url') + raise ExtractorError('Unable to extract track url') + + PREFIX = ( + r'
', + webpage, 'upload date')) return { 'id': track_id, - 'title': info['name'], + 'title': title, 'url': final_song_url, - 'description': info.get('description'), - 'thumbnail': info['pictures'].get('extra_large'), - 'uploader': info['user']['name'], - 'uploader_id': info['user']['username'], - 'upload_date': unified_strdate(info['created_time']), - 'view_count': info['play_count'], + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'timestamp': timestamp, + 'view_count': view_count, + 'like_count': like_count, } diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py new file mode 100644 index 000000000..43146180a --- /dev/null +++ b/youtube_dl/extractor/moviezine.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class MoviezineIE(InfoExtractor): + _VALID_URL = r'https?://www\.moviezine\.se/video/(?P[^?#]+)' + + _TEST = { + 'url': 'http://www.moviezine.se/video/205866', + 'info_dict': { + 'id': '205866', + 'ext': 'mp4', + 'title': 'Oculus - Trailer 1', + 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4', + 'thumbnail': 're:http://.*\.jpg', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player') + + formats =[{ + 'format_id': 'sd', + 'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'), + 'quality': 0, + 'ext': 'mp4', + }] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'), + 'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'), + 'formats': formats, + 'description': self._og_search_description(webpage), + } diff --git 
a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0650f9564..3d6096e46 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + qualities, +) class NDRIE(InfoExtractor): @@ -45,17 +49,16 @@ class NDRIE(InfoExtractor): page = self._download_webpage(url, video_id, 'Downloading page') - title = self._og_search_title(page) + title = self._og_search_title(page).strip() description = self._og_search_description(page) + if description: + description = description.strip() - mobj = re.search( - r'
(?P\d+):(?P\d+)
', - page) - duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False)) formats = [] - mp3_url = re.search(r'''{src:'(?P