diff --git a/.gitignore b/.gitignore index 24fdb3626..7dd0ad09b 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ updates_key.pem *.mp4 *.part test/testdata +.tox diff --git a/Makefile b/Makefile index 85dacfa4c..c6d09932b 100644 --- a/Makefile +++ b/Makefile @@ -13,13 +13,13 @@ PYTHON=/usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc + SYSCONFDIR=/etc else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif + ifeq ($(PREFIX),/usr/local) + SYSCONFDIR=/etc + else + SYSCONFDIR=$(PREFIX)/etc + endif endif install: youtube-dl youtube-dl.1 youtube-dl.bash-completion @@ -71,6 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache' \ --exclude '.git' \ + --exclude 'testdata' \ -- \ bin devscripts test youtube_dl \ CHANGELOG LICENSE README.md README.txt \ diff --git a/README.md b/README.md index fc8070c37..a2b296613 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ which means you can modify it, redistribute it or use it however you like. sudo if needed) -i, --ignore-errors continue on download errors, for example to to skip unavailable videos in a playlist + --abort-on-error Abort downloading of further videos (in the + playlist or the command line) if an error occurs --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent --referer REF specify a custom referer, use if the video access @@ -30,9 +32,10 @@ which means you can modify it, redistribute it or use it however you like. --extractor-descriptions Output descriptions of all supported extractors --proxy URL Use the specified HTTP/HTTPS proxy --no-check-certificate Suppress HTTPS certificate validation. - --cache-dir None Location in the filesystem where youtube-dl can - store downloaded information permanently. 
- ~/.youtube-dl/cache by default + --cache-dir DIR Location in the filesystem where youtube-dl can + store downloaded information permanently. By + default $XDG_CACHE_HOME/youtube-dl or ~/.cache + /youtube-dl . --no-cache-dir Disable filesystem caching ## Video Selection: @@ -50,11 +53,16 @@ which means you can modify it, redistribute it or use it however you like. --date DATE download only videos uploaded in this date --datebefore DATE download only videos uploaded before this date --dateafter DATE download only videos uploaded after this date + --no-playlist download only the currently playing video + --age-limit YEARS download only videos suitable for the given age + --download-archive FILE Download only videos not present in the archive + file. Record all downloaded videos in it. ## Download Options: - -r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m) + -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. + 50K or 4.2M) -R, --retries RETRIES number of retries (default is 10) - --buffer-size SIZE size of download buffer (e.g. 1024 or 16k) + --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer do not automatically adjust the buffer size. By default, the buffer size is automatically resized @@ -70,7 +78,10 @@ which means you can modify it, redistribute it or use it however you like. 
%(uploader_id)s for the uploader nickname if different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename - extension, %(upload_date)s for the upload date + extension, %(format)s for the format description + (like "22 - 1280x720" or "HD"),%(format_id)s for + the unique id of the format (like Youtube's + itags: "137"),%(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id , %(playlist)s for the playlist the video is in, @@ -95,6 +106,7 @@ which means you can modify it, redistribute it or use it however you like. file modification time --write-description write video description to a .description file --write-info-json write video metadata to a .info.json file + --write-annotations write video annotations to a .annotation file --write-thumbnail write thumbnail image to disk ## Verbosity / Simulation Options: @@ -115,6 +127,8 @@ which means you can modify it, redistribute it or use it however you like. -v, --verbose print various debugging information --dump-intermediate-pages print downloaded pages to debug problems(very verbose) + --write-pages Write downloaded pages to files in the current + directory ## Video Format Options: -f, --format FORMAT video format code, specifiy the order of @@ -161,6 +175,7 @@ which means you can modify it, redistribute it or use it however you like. 
processed files are overwritten by default --embed-subs embed subtitles in the video (only for mp4 videos) + --add-metadata add metadata to the files # CONFIGURATION diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index bd10f63c2..ce893fcbe 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,4 +1,4 @@ -__youtube-dl() +__youtube_dl() { local cur prev opts COMPREPLY=() @@ -15,4 +15,4 @@ __youtube-dl() fi } -complete -F __youtube-dl youtube-dl +complete -F __youtube_dl youtube-dl diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py new file mode 100644 index 000000000..63401fe18 --- /dev/null +++ b/devscripts/check-porn.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +""" +This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check +if we are not 'age_limit' tagging some porn site +""" + +# Allow direct execution +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_testcases +from youtube_dl.utils import compat_urllib_request + +for test in get_testcases(): + try: + webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() + except: + print('\nFail: {0}'.format(test['name'])) + continue + + webpage = webpage.decode('utf8', 'replace') + + if 'porn' in webpage.lower() and ('info_dict' not in test + or 'age_limit' not in test['info_dict'] + or test['info_dict']['age_limit'] != 18): + print('\nPotential missing age_limit check: {0}'.format(test['name'])) + + elif 'porn' not in webpage.lower() and ('info_dict' in test and + 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): + print('\nPotential false negative: {0}'.format(test['name'])) + + else: + sys.stdout.write('.') + sys.stdout.flush() + +print() diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index 33f242480..153e15c8a 100755 --- a/devscripts/gh-pages/update-sites.py +++ 
b/devscripts/gh-pages/update-sites.py @@ -16,10 +16,11 @@ def main(): ie_htmls = [] for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): ie_html = '{}'.format(ie.IE_NAME) - try: + ie_desc = getattr(ie, 'IE_DESC', None) + if ie_desc is False: + continue + elif ie_desc is not None: ie_html += ': {}'.format(ie.IE_DESC) - except AttributeError: - pass if ie.working() == False: ie_html += ' (Currently broken)' ie_htmls.append('
  • {}
  • '.format(ie_html)) diff --git a/devscripts/release.sh b/devscripts/release.sh index 796468b4b..2766174c1 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -88,10 +88,6 @@ ROOT=$(pwd) "$ROOT/devscripts/gh-pages/update-sites.py" git add *.html *.html.in update git commit -m "release $version" - git show HEAD - read -p "Is it good, can I push? (y/n) " -n 1 - if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi - echo git push "$ROOT" gh-pages git push "$ORIGIN_URL" gh-pages ) diff --git a/setup.py b/setup.py index 3b6dc2d40..aa7cfca08 100644 --- a/setup.py +++ b/setup.py @@ -8,8 +8,10 @@ import sys try: from setuptools import setup + setuptools_available = True except ImportError: from distutils.core import setup + setuptools_available = False try: # This will create an exe that needs Microsoft Visual C++ 2008 @@ -43,13 +45,16 @@ if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': params = py2exe_params else: params = { - 'scripts': ['bin/youtube-dl'], 'data_files': [ # Installing system-wide would require sudo... 
('etc/bash_completion.d', ['youtube-dl.bash-completion']), ('share/doc/youtube_dl', ['README.txt']), ('share/man/man1/', ['youtube-dl.1']) ] } + if setuptools_available: + params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']} + else: + params['scripts'] = ['bin/youtube-dl'] # Get the version from youtube_dl/version.py without importing the package exec(compile(open('youtube_dl/version.py').read(), @@ -63,6 +68,7 @@ setup( ' YouTube.com and other video sites.', url='https://github.com/rg3/youtube-dl', author='Ricardo Garcia', + author_email='ytdl@yt-dl.org', maintainer='Philipp Hagemeister', maintainer_email='phihag@phihag.de', packages=['youtube_dl', 'youtube_dl.extractor'], diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/helper.py b/test/helper.py index a2b468b50..d7bf7a828 100644 --- a/test/helper.py +++ b/test/helper.py @@ -1,38 +1,80 @@ +import errno import io +import hashlib import json import os.path +import re +import types +import sys import youtube_dl.extractor -from youtube_dl import YoutubeDL, YoutubeDLHandler -from youtube_dl.utils import ( - compat_cookiejar, - compat_urllib_request, -) +from youtube_dl import YoutubeDL +from youtube_dl.utils import preferredencoding -# General configuration (from __init__, not very elegant...) 
-jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - parameters = json.load(pf) +def global_setup(): + youtube_dl._setup_opener(timeout=10) + + +def get_params(override=None): + PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "parameters.json") + with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: + parameters = json.load(pf) + if override: + parameters.update(override) + return parameters + + +def try_rm(filename): + """ Remove a file if it exists """ + try: + os.remove(filename) + except OSError as ose: + if ose.errno != errno.ENOENT: + raise + + +def report_warning(message): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if sys.stderr.isatty() and os.name != 'nt': + _msg_header = u'\033[0;33mWARNING:\033[0m' + else: + _msg_header = u'WARNING:' + output = u'%s %s\n' % (_msg_header, message) + if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: + output = output.encode(preferredencoding()) + sys.stderr.write(output) + class FakeYDL(YoutubeDL): - def __init__(self): - self.result = [] + def __init__(self, override=None): # Different instances of the downloader can't share the same dictionary # some test set the "sublang" parameter, which would break the md5 checks. 
- self.params = dict(parameters) - def to_screen(self, s): + params = get_params(override=override) + super(FakeYDL, self).__init__(params) + self.result = [] + + def to_screen(self, s, skip_eol=None): print(s) + def trouble(self, s, tb=None): raise Exception(s) + def download(self, x): self.result.append(x) + def expect_warning(self, regex): + # Silence an expected warning matching a regex + old_report_warning = self.report_warning + def report_warning(self, message): + if re.match(regex, message): return + old_report_warning(message) + self.report_warning = types.MethodType(report_warning, self) + def get_testcases(): for ie in youtube_dl.extractor.gen_extractors(): t = getattr(ie, '_TEST', None) @@ -42,3 +84,6 @@ def get_testcases(): for t in getattr(ie, '_TESTS', []): t['name'] = type(ie).__name__[:-len('IE')] yield t + + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py new file mode 100644 index 000000000..58cf9c313 --- /dev/null +++ b/test/test_YoutubeDL.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL + + +class YDL(FakeYDL): + def __init__(self, *args, **kwargs): + super(YDL, self).__init__(*args, **kwargs) + self.downloaded_info_dicts = [] + self.msgs = [] + + def process_info(self, info_dict): + self.downloaded_info_dicts.append(info_dict) + + def to_screen(self, msg): + self.msgs.append(msg) + + +class TestFormatSelection(unittest.TestCase): + def test_prefer_free_formats(self): + # Same resolution => download webm + ydl = YDL() + ydl.params['prefer_free_formats'] = True + formats = [ + {u'ext': u'webm', u'height': 460}, + {u'ext': u'mp4', u'height': 460}, + ] + info_dict = {u'formats': formats, u'extractor': u'test'} + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + 
self.assertEqual(downloaded[u'ext'], u'webm') + + # Different resolution => download best quality (mp4) + ydl = YDL() + ydl.params['prefer_free_formats'] = True + formats = [ + {u'ext': u'webm', u'height': 720}, + {u'ext': u'mp4', u'height': 1080}, + ] + info_dict[u'formats'] = formats + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded[u'ext'], u'mp4') + + # No prefer_free_formats => keep original formats order + ydl = YDL() + ydl.params['prefer_free_formats'] = False + formats = [ + {u'ext': u'webm', u'height': 720}, + {u'ext': u'flv', u'height': 720}, + ] + info_dict[u'formats'] = formats + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded[u'ext'], u'flv') + + def test_format_limit(self): + formats = [ + {u'format_id': u'meh', u'url': u'http://example.com/meh'}, + {u'format_id': u'good', u'url': u'http://example.com/good'}, + {u'format_id': u'great', u'url': u'http://example.com/great'}, + {u'format_id': u'excellent', u'url': u'http://example.com/exc'}, + ] + info_dict = { + u'formats': formats, u'extractor': u'test', 'id': 'testvid'} + + ydl = YDL() + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded[u'format_id'], u'excellent') + + ydl = YDL({'format_limit': 'good'}) + assert ydl.params['format_limit'] == 'good' + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded[u'format_id'], u'good') + + ydl = YDL({'format_limit': 'great', 'format': 'all'}) + ydl.process_ie_result(info_dict) + self.assertEqual(ydl.downloaded_info_dicts[0][u'format_id'], u'meh') + self.assertEqual(ydl.downloaded_info_dicts[1][u'format_id'], u'good') + self.assertEqual(ydl.downloaded_info_dicts[2][u'format_id'], u'great') + self.assertTrue('3' in ydl.msgs[0]) + + ydl = YDL() + ydl.params['format_limit'] = 'excellent' + ydl.process_ie_result(info_dict) + downloaded = 
ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded[u'format_id'], u'excellent') + + def test_format_selection(self): + formats = [ + {u'format_id': u'35', u'ext': u'mp4'}, + {u'format_id': u'45', u'ext': u'webm'}, + {u'format_id': u'47', u'ext': u'webm'}, + {u'format_id': u'2', u'ext': u'flv'}, + ] + info_dict = {u'formats': formats, u'extractor': u'test'} + + ydl = YDL({'format': u'20/47'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'47') + + ydl = YDL({'format': u'20/71/worst'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'35') + + ydl = YDL() + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'2') + + ydl = YDL({'format': u'webm/mp4'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'47') + + ydl = YDL({'format': u'3gp/40/mp4'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], u'35') + + def test_add_extra_info(self): + test_dict = { + 'extractor': 'Foo', + } + extra_info = { + 'extractor': 'Bar', + 'playlist': 'funny videos', + } + YDL.add_extra_info(test_dict, extra_info) + self.assertEqual(test_dict['extractor'], 'Foo') + self.assertEqual(test_dict['playlist'], 'funny videos') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py new file mode 100644 index 000000000..d500c6edc --- /dev/null +++ b/test/test_age_restriction.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup, try_rm +global_setup() + + +from youtube_dl import 
YoutubeDL + + +def _download_restricted(url, filename, age): + """ Returns true iff the file has been downloaded """ + + params = { + 'age_limit': age, + 'skip_download': True, + 'writeinfojson': True, + "outtmpl": "%(id)s.%(ext)s", + } + ydl = YoutubeDL(params) + ydl.add_default_info_extractors() + json_filename = filename + '.info.json' + try_rm(json_filename) + ydl.download([url]) + res = os.path.exists(json_filename) + try_rm(json_filename) + return res + + +class TestAgeRestriction(unittest.TestCase): + def _assert_restricted(self, url, filename, age, old_age=None): + self.assertTrue(_download_restricted(url, filename, old_age)) + self.assertFalse(_download_restricted(url, filename, age)) + + def test_youtube(self): + self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + + def test_youporn(self): + self._assert_restricted( + 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + '505835.mp4', 2, old_age=25) + + def test_pornotube(self): + self._assert_restricted( + 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', + '1689755.flv', 13) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index ff1c86efe..56e5f80e1 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -1,14 +1,20 @@ #!/usr/bin/env python -import sys -import unittest - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from test.helper import get_testcases + +from youtube_dl.extractor import ( + gen_extractors, + JustinTVIE, + YoutubeIE, +) -from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors -from helper import get_testcases class TestAllURLsMatching(unittest.TestCase): def setUp(self): diff --git a/test/test_dailymotion_subtitles.py 
b/test/test_dailymotion_subtitles.py index 83c65d57e..ba3580ea4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -1,20 +1,16 @@ #!/usr/bin/env python -import sys -import unittest -import json -import io -import hashlib - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() + from youtube_dl.extractor import DailymotionIE -from youtube_dl.utils import * -from helper import FakeYDL - -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestDailymotionSubtitles(unittest.TestCase): def setUp(self): @@ -26,7 +22,7 @@ class TestDailymotionSubtitles(unittest.TestCase): return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict['subtitles'] def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) @@ -45,15 +41,18 @@ class TestDailymotionSubtitles(unittest.TestCase): subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) def test_list_subtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_automatic_captions(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslang'] = ['en'] subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) == 0) def test_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True diff 
--git a/test/test_download.py b/test/test_download.py index 23a66254d..73379beb1 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -1,43 +1,39 @@ #!/usr/bin/env python -import errno +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import ( + get_params, + get_testcases, + global_setup, + try_rm, + md5, + report_warning +) +global_setup() + + import hashlib import io -import os import json -import unittest -import sys import socket -import binascii - -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import youtube_dl.YoutubeDL -from youtube_dl.utils import * - -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") +from youtube_dl.utils import ( + compat_str, + compat_urllib_error, + compat_HTTPError, + DownloadError, + ExtractorError, + UnavailableVideoError, +) RETRIES = 3 -# General configuration (from __init__, not very elegant...) 
-jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) -socket.setdefaulttimeout(10) - -def _try_rm(filename): - """ Remove a file if it exists """ - try: - os.remove(filename) - except OSError as ose: - if ose.errno != errno.ENOENT: - raise - -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() - class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen @@ -54,17 +50,12 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() -from helper import get_testcases defs = get_testcases() -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - parameters = json.load(pf) - class TestDownload(unittest.TestCase): maxDiff = None def setUp(self): - self.parameters = parameters self.defs = defs ### Dynamically generate tests @@ -77,15 +68,17 @@ def generator(test_case): if not ie._WORKING: print_skipping('IE marked as not _WORKING') return - if 'playlist' not in test_case and not test_case['file']: - print_skipping('No output file specified') - return + if 'playlist' not in test_case: + info_dict = test_case.get('info_dict', {}) + if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')): + print_skipping('The output file cannot be know, the "file" ' + 'key is missing or the info_dict is incomplete') + return if 'skip' in test_case: print_skipping(test_case['skip']) return - params = self.parameters.copy() - params.update(test_case.get('params', {})) + params = get_params(test_case.get('params', {})) ydl = YoutubeDL(params) ydl.add_default_info_extractors() @@ -95,35 +88,47 @@ def generator(test_case): finished_hook_called.add(status['filename']) ydl.fd.add_progress_hook(_hook) + def get_tc_filename(tc): + return 
tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) + test_cases = test_case.get('playlist', [test_case]) - for tc in test_cases: - _try_rm(tc['file']) - _try_rm(tc['file'] + '.part') - _try_rm(tc['file'] + '.info.json') + def try_rm_tcs_files(): + for tc in test_cases: + tc_filename = get_tc_filename(tc) + try_rm(tc_filename) + try_rm(tc_filename + '.part') + try_rm(tc_filename + '.info.json') + try_rm_tcs_files() try: - for retry in range(1, RETRIES + 1): + try_num = 1 + while True: try: ydl.download([test_case['url']]) except (DownloadError, ExtractorError) as err: - if retry == RETRIES: raise - # Check if the exception is not a network related one - if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): raise - print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry)) + if try_num == RETRIES: + report_warning(u'Failed due to network errors, skipping...') + return + + print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) + + try_num += 1 else: break for tc in test_cases: + tc_filename = get_tc_filename(tc) if not test_case.get('params', {}).get('skip_download', False): - self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file']) - self.assertTrue(tc['file'] in finished_hook_called) - self.assertTrue(os.path.exists(tc['file'] + '.info.json')) + self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) + self.assertTrue(tc_filename in finished_hook_called) + self.assertTrue(os.path.exists(tc_filename + '.info.json')) if 'md5' in tc: - md5_for_file = _file_md5(tc['file']) + md5_for_file = _file_md5(tc_filename) self.assertEqual(md5_for_file, tc['md5']) - with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: + with io.open(tc_filename + '.info.json', 
encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, expected) in tc.get('info_dict', {}).items(): if isinstance(expected, compat_str) and expected.startswith('md5:'): @@ -143,11 +148,11 @@ def generator(test_case): # Check for the presence of mandatory fields for key in ('id', 'url', 'title', 'ext'): self.assertTrue(key in info_dict.keys() and info_dict[key]) + # Check for mandatory fields that are automatically set by YoutubeDL + for key in ['webpage_url', 'extractor', 'extractor_key']: + self.assertTrue(info_dict.get(key), u'Missing field: %s' % key) finally: - for tc in test_cases: - _try_rm(tc['file']) - _try_rm(tc['file'] + '.part') - _try_rm(tc['file'] + '.info.json') + try_rm_tcs_files() return test_template diff --git a/test/test_playlists.py b/test/test_playlists.py index c33511333..de1e8d88e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -1,13 +1,16 @@ #!/usr/bin/env python # encoding: utf-8 -import sys -import unittest -import json # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup +global_setup() + from youtube_dl.extractor import ( DailymotionPlaylistIE, @@ -16,10 +19,10 @@ from youtube_dl.extractor import ( UstreamChannelIE, SoundcloudUserIE, LivestreamIE, + NHLVideocenterIE, + BambuserChannelIE, ) -from youtube_dl.utils import * -from helper import FakeYDL class TestPlaylists(unittest.TestCase): def assertIsPlaylist(self, info): @@ -74,5 +77,22 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'TEDCity2.0 (English)') self.assertTrue(len(result['entries']) >= 4) + def test_nhl_videocenter(self): + dl = FakeYDL() + ie = NHLVideocenterIE(dl) + result = ie.extract('http://video.canucks.nhl.com/videocenter/console?catid=999') + self.assertIsPlaylist(result) + 
self.assertEqual(result['id'], u'999') + self.assertEqual(result['title'], u'Highlights') + self.assertEqual(len(result['entries']), 12) + + def test_bambuser_channel(self): + dl = FakeYDL() + ie = BambuserChannelIE(dl) + result = ie.extract('http://bambuser.com/channel/pixelversity') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'pixelversity') + self.assertTrue(len(result['entries']) >= 66) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index ff2e9885b..f3fbff042 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,14 +1,15 @@ #!/usr/bin/env python - -# Various small unit tests - -import sys -import unittest -import xml.etree.ElementTree +# coding: utf-8 # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +# Various small unit tests +import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform from youtube_dl.utils import ( @@ -20,6 +21,9 @@ from youtube_dl.utils import ( unified_strdate, find_xpath_attr, get_meta_content, + xpath_with_ns, + smuggle_url, + unsmuggle_url, ) if sys.version_info < (3, 0): @@ -141,5 +145,31 @@ class TestUtil(unittest.TestCase): self.assertEqual(get_meta('description'), u'foo & bar') self.assertEqual(get_meta('author'), 'Plato') + def test_xpath_with_ns(self): + testxml = u''' + + The Author + http://server.com/download.mp3 + + ''' + doc = xml.etree.ElementTree.fromstring(testxml) + find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) + self.assertTrue(find('media:song') is not None) + self.assertEqual(find('media:song/media:author').text, u'The Author') + self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3') + + def test_smuggle_url(self): + data = {u"ö": u"ö", u"abc": [3]} + url = 'https://foo.bar/baz?x=y#a' 
+ smug_url = smuggle_url(url, data) + unsmug_url, unsmug_data = unsmuggle_url(smug_url) + self.assertEqual(url, unsmug_url) + self.assertEqual(data, unsmug_data) + + res_url, res_data = unsmuggle_url(url) + self.assertEqual(res_url, url) + self.assertEqual(res_data, None) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py new file mode 100644 index 000000000..35defb895 --- /dev/null +++ b/test/test_write_annotations.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_params, global_setup, try_rm +global_setup() + + +import io + +import xml.etree.ElementTree + +import youtube_dl.YoutubeDL +import youtube_dl.extractor + + +class YoutubeDL(youtube_dl.YoutubeDL): + def __init__(self, *args, **kwargs): + super(YoutubeDL, self).__init__(*args, **kwargs) + self.to_stderr = self.to_screen + +params = get_params({ + 'writeannotations': True, + 'skip_download': True, + 'writeinfojson': False, + 'format': 'flv', +}) + + + +TEST_ID = 'gr51aVj-mLg' +ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' +EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] + +class TestAnnotations(unittest.TestCase): + def setUp(self): + # Clear old files + self.tearDown() + + + def test_info_json(self): + expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text. 
+ ie = youtube_dl.extractor.YoutubeIE() + ydl = YoutubeDL(params) + ydl.add_info_extractor(ie) + ydl.download([TEST_ID]) + self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) + annoxml = None + with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: + annoxml = xml.etree.ElementTree.parse(annof) + self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') + root = annoxml.getroot() + self.assertEqual(root.tag, 'document') + annotationsTag = root.find('annotations') + self.assertEqual(annotationsTag.tag, 'annotations') + annotations = annotationsTag.findall('annotation') + + #Not all the annotations have TEXT children and the annotations are returned unsorted. + for a in annotations: + self.assertEqual(a.tag, 'annotation') + if a.get('type') == 'text': + textTag = a.find('TEXT') + text = textTag.text + self.assertTrue(text in expected) #assertIn only added in python 2.7 + #remove the first occurance, there could be more than one annotation with the same text + expected.remove(text) + #We should have seen (and removed) all the expected annotation texts. 
+ self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') + + + def tearDown(self): + try_rm(ANNOTATIONS_FILE) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index de6d5180f..a5b6f6972 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -1,37 +1,34 @@ #!/usr/bin/env python # coding: utf-8 -import json +# Allow direct execution import os import sys import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import get_params, global_setup +global_setup() + + +import io +import json import youtube_dl.YoutubeDL import youtube_dl.extractor -from youtube_dl.utils import * -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") - -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = self.to_screen -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - params = json.load(pf) -params['writeinfojson'] = True -params['skip_download'] = True -params['writedescription'] = True +params = get_params({ + 'writeinfojson': True, + 'skip_download': True, + 'writedescription': True, +}) + TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.mp4.info.json' @@ -42,6 +39,7 @@ This is a test video for youtube-dl. 
For more information, contact phihag@phihag.de .''' + class TestInfoJSON(unittest.TestCase): def setUp(self): # Clear old files diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index dd9e292b0..4b7a7847b 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,20 +1,26 @@ #!/usr/bin/env python -import sys -import unittest -import json - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE -from youtube_dl.utils import * +from test.helper import FakeYDL, global_setup +global_setup() + + +from youtube_dl.extractor import ( + YoutubeUserIE, + YoutubePlaylistIE, + YoutubeIE, + YoutubeChannelIE, + YoutubeShowIE, +) -from helper import FakeYDL class TestYoutubeLists(unittest.TestCase): - def assertIsPlaylist(self,info): + def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" self.assertEqual(info['_type'], 'playlist') @@ -27,6 +33,14 @@ class TestYoutubeLists(unittest.TestCase): ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE']) + def test_youtube_playlist_noplaylist(self): + dl = FakeYDL() + dl.params['noplaylist'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertEqual(result['_type'], 'url') + self.assertEqual(YoutubeIE()._extract_id(result['url']), 'FXxLjLQi3Fg') + def test_issue_673(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) @@ -92,7 +106,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() ie = YoutubeShowIE(dl) result = ie.extract('http://www.youtube.com/show/airdisasters') - 
self.assertTrue(len(result) >= 4) + self.assertTrue(len(result) >= 3) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5007d9a16..5e1ff5eb0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -1,14 +1,18 @@ #!/usr/bin/env python +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup +global_setup() + + import io import re import string -import sys -import unittest - -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeIE from youtube_dl.utils import compat_str, compat_urlretrieve diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 168e6c66c..00430a338 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -1,76 +1,87 @@ #!/usr/bin/env python -import sys -import unittest -import json -import io -import hashlib - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() + from youtube_dl.extractor import YoutubeIE -from youtube_dl.utils import * -from helper import FakeYDL -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() self.url = 'QRS8MkLhQmM' + def getInfoDict(self): IE = YoutubeIE(self.DL) info_dict = IE.extract(self.url) return info_dict + def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict[0]['subtitles'] + def test_youtube_no_writesubtitles(self): self.DL.params['writesubtitles'] = 
False subtitles = self.getSubtitles() self.assertEqual(subtitles, None) + def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'sbv' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') + def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + def test_youtube_list_subtitles(self): + self.DL.expect_warning(u'Video doesn\'t have automatic captions') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) + def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') self.url = 'sAjKT8FhjI8' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) + def test_youtube_multiple_langs(self): self.url = 'QRS8MkLhQmM' self.DL.params['writesubtitles'] = True diff --git a/tox.ini b/tox.ini 
new file mode 100644 index 000000000..ed01e3386 --- /dev/null +++ b/tox.ini @@ -0,0 +1,8 @@ +[tox] +envlist = py26,py27,py33 +[testenv] +deps = + nose + coverage +commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html + # test.test_download:TestDownload.test_NowVideo diff --git a/youtube-dl b/youtube-dl index d2401a2d8..ba664b481 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index d6673fd3a..8ecabab1a 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -270,6 +270,7 @@ class FileDownloader(object): def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) tmpfilename = self.temp_name(filename) + test = self.params.get('test', False) # Check for rtmpdump first try: @@ -291,6 +292,8 @@ class FileDownloader(object): basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] + if test: + basic_args += ['--stop', '1'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: @@ -300,7 +303,7 @@ class FileDownloader(object): shell_quote = repr self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) retval = subprocess.call(args) - while retval == 2 or retval == 1: + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) time.sleep(5.0) # This seems to be needed @@ -313,7 +316,7 @@ class FileDownloader(object): self.to_screen(u'\r[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') retval = 0 break - if retval == 0: + if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 3ee1d3c58..13b56ede5 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -3,7 +3,14 @@ import subprocess import sys import time -from .utils import * + +from .utils import ( + compat_subprocess_get_DEVNULL, + encodeFilename, + PostProcessingError, + shell_quote, + subtitles_filename, +) class PostProcessor(object): @@ -82,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor): + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: @@ -177,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: - if int(self._preferredquality) < 10: + # The opus codec doesn't support the -aq option + if int(self._preferredquality) < 10 and extension != 'opus': more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality] else: more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k'] @@ -467,3 +477,35 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return True, information + + +class FFmpegMetadataPP(FFmpegPostProcessor): + def run(self, info): + metadata = {} + if info.get('title') is not None: + metadata['title'] = info['title'] + if info.get('upload_date') is not None: + metadata['date'] = info['upload_date'] + if info.get('uploader') is not 
None: + metadata['artist'] = info['uploader'] + elif info.get('uploader_id') is not None: + metadata['artist'] = info['uploader_id'] + + if not metadata: + self._downloader.to_screen(u'[ffmpeg] There isn\'t any metadata to add') + return True, info + + filename = info['filepath'] + ext = os.path.splitext(filename)[1][1:] + temp_filename = filename + u'.temp' + + options = ['-c', 'copy'] + for (name, value) in metadata.items(): + options.extend(['-metadata', '%s="%s"' % (name, value)]) + options.extend(['-f', ext]) + + self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) + self.run_ffmpeg(filename, temp_filename, options) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + return True, info diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 62982521e..d3562826e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -3,6 +3,7 @@ from __future__ import absolute_import +import errno import io import os import re @@ -70,6 +71,7 @@ class YoutubeDL(object): logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file @@ -83,7 +85,13 @@ class YoutubeDL(object): skip_download: Skip the actual download of the video file cachedir: Location of the cache files in the filesystem. None to disable filesystem cache. - + noplaylist: Download single video instead of a playlist if in doubt. + age_limit: An integer representing the user's age in years. + Unsuitable videos for the given age are skipped. + downloadarchive: File name of a file where all downloads are recorded. 
+ Videos already present in the file are not downloaded + again. + The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, @@ -112,7 +120,7 @@ class YoutubeDL(object): and not params['restrictfilenames']): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( - u'Assuming --restrict-filenames isnce file system encoding ' + u'Assuming --restrict-filenames since file system encoding ' u'cannot encode all charactes. ' u'Set the LC_ALL environment variable to fix this.') params['restrictfilenames'] = True @@ -208,10 +216,10 @@ class YoutubeDL(object): If stderr is a tty file the 'WARNING:' will be colored ''' if sys.stderr.isatty() and os.name != 'nt': - _msg_header=u'\033[0;33mWARNING:\033[0m' + _msg_header = u'\033[0;33mWARNING:\033[0m' else: - _msg_header=u'WARNING:' - warning_message=u'%s %s' % (_msg_header,message) + _msg_header = u'WARNING:' + warning_message = u'%s %s' % (_msg_header, message) self.to_stderr(warning_message) def report_error(self, message, tb=None): @@ -226,19 +234,6 @@ class YoutubeDL(object): error_message = u'%s %s' % (_msg_header, message) self.trouble(error_message, tb) - def slow_down(self, start_time, byte_counter): - """Sleep if the download speed is over the rate limit.""" - rate_limit = self.params.get('ratelimit', None) - if rate_limit is None or byte_counter == 0: - return - now = time.time() - elapsed = now - start_time - if elapsed <= 0.0: - return - speed = float(byte_counter) / elapsed - if speed > rate_limit: - time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) - def report_writedescription(self, descfn): """ Report that the description file is being written """ self.to_screen(u'[info] Writing video description to: ' + descfn) @@ -251,6 +246,10 @@ class YoutubeDL(object): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video 
description metadata as JSON to: ' + infofn) + def report_writeannotations(self, annofn): + """ Report that the annotations file has been written. """ + self.to_screen(u'[info] Writing video annotations to: ' + annofn) + def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -273,16 +272,18 @@ class YoutubeDL(object): autonumber_size = 5 autonumber_templ = u'%0' + str(autonumber_size) + u'd' template_dict['autonumber'] = autonumber_templ % self._num_downloads - if template_dict['playlist_index'] is not None: + if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] - sanitize = lambda k,v: sanitize_filename( + sanitize = lambda k, v: sanitize_filename( u'NA' if v is None else compat_str(v), restricted=self.params.get('restrictfilenames'), - is_id=(k==u'id')) - template_dict = dict((k, sanitize(k, v)) for k,v in template_dict.items()) + is_id=(k == u'id')) + template_dict = dict((k, sanitize(k, v)) + for k, v in template_dict.items()) - filename = self.params['outtmpl'] % template_dict + tmpl = os.path.expanduser(self.params['outtmpl']) + filename = tmpl % template_dict return filename except KeyError as err: self.report_error(u'Erroneous output template') @@ -308,15 +309,28 @@ class YoutubeDL(object): dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + age_limit = self.params.get('age_limit') + if age_limit is not None: + if age_limit < info_dict.get('age_limit', 0): + return u'Skipping "' + title + '" because it is age restricted' + if self.in_download_archive(info_dict): + return (u'%(title)s has already been recorded in archive' + % info_dict) return None - + + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value 
in extra_info.items(): + info_dict.setdefault(key, value) + def extract_info(self, url, download=True, ie_key=None, extra_info={}): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. extra_info is a dict containing the extra values to add to each result ''' - + if ie_key: ies = [self.get_info_extractor(ie_key)] else: @@ -336,17 +350,17 @@ class YoutubeDL(object): break if isinstance(ie_result, list): # Backwards compatibility: old IE result format - for result in ie_result: - result.update(extra_info) ie_result = { '_type': 'compat_list', 'entries': ie_result, } - else: - ie_result.update(extra_info) - if 'extractor' not in ie_result: - ie_result['extractor'] = ie.IE_NAME - return self.process_ie_result(ie_result, download=download) + self.add_extra_info(ie_result, + { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'extractor_key': ie.ie_key(), + }) + return self.process_ie_result(ie_result, download, extra_info) except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break @@ -358,7 +372,7 @@ class YoutubeDL(object): raise else: self.report_error(u'no suitable InfoExtractor: %s' % url) - + def process_ie_result(self, ie_result, download=True, extra_info={}): """ Take the result of the ie(may be modified) and resolve all unresolved @@ -370,14 +384,8 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system if result_type == 'video': - ie_result.update(extra_info) - if 'playlist' not in ie_result: - # It isn't part of a playlist - ie_result['playlist'] = None - ie_result['playlist_index'] = None - if download: - self.process_info(ie_result) - return ie_result + self.add_extra_info(ie_result, extra_info) + return self.process_video_result(ie_result) elif result_type == 'url': # We have to add extra_info to the results because it may be # contained in a playlist @@ 
-386,9 +394,10 @@ class YoutubeDL(object): ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'playlist': + self.add_extra_info(ie_result, extra_info) # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) - self.to_screen(u'[download] Downloading playlist: %s' % playlist) + self.to_screen(u'[download] Downloading playlist: %s' % playlist) playlist_results = [] @@ -406,17 +415,15 @@ class YoutubeDL(object): self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % (ie_result['extractor'], playlist, n_all_entries, n_entries)) - for i,entry in enumerate(entries,1): - self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries)) + for i, entry in enumerate(entries, 1): + self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries)) extra = { - 'playlist': playlist, - 'playlist_index': i + playliststart, - } - if not 'extractor' in entry: - # We set the extractor, if it's an url it will be set then to - # the new extractor, but if it's already a video we must make - # sure it's present: see issue #877 - entry['extractor'] = ie_result['extractor'] + 'playlist': playlist, + 'playlist_index': i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'extractor_key': ie_result['extractor_key'], + } entry_result = self.process_ie_result(entry, download=download, extra_info=extra) @@ -425,16 +432,122 @@ class YoutubeDL(object): return ie_result elif result_type == 'compat_list': def _fixup(r): - r.setdefault('extractor', ie_result['extractor']) + self.add_extra_info(r, + { + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'extractor_key': ie_result['extractor_key'], + }) return r ie_result['entries'] = [ - self.process_ie_result(_fixup(r), download=download) + self.process_ie_result(_fixup(r), download, extra_info) for r in ie_result['entries'] ] return ie_result 
else: raise Exception('Invalid result type: %s' % result_type) + def select_format(self, format_spec, available_formats): + if format_spec == 'best' or format_spec is None: + return available_formats[-1] + elif format_spec == 'worst': + return available_formats[0] + else: + extensions = [u'mp4', u'flv', u'webm', u'3gp'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, available_formats)) + if matches: + return matches[-1] + return None + + def process_video_result(self, info_dict, download=True): + assert info_dict.get('_type', 'video') == 'video' + + if 'playlist' not in info_dict: + # It isn't part of a playlist + info_dict['playlist'] = None + info_dict['playlist_index'] = None + + # This extractors handle format selection themselves + if info_dict['extractor'] in [u'youtube', u'Youku']: + if download: + self.process_info(info_dict) + return info_dict + + # We now pick which formats have to be downloaded + if info_dict.get('formats') is None: + # There's only one format available + formats = [info_dict] + else: + formats = info_dict['formats'] + + # We check that all the formats have the format and format_id fields + for (i, format) in enumerate(formats): + if format.get('format_id') is None: + format['format_id'] = compat_str(i) + if format.get('format') is None: + format['format'] = u'{id} - {res}{note}'.format( + id=format['format_id'], + res=self.format_resolution(format), + note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', + ) + # Automatically determine file extension if missing + if 'ext' not in format: + format['ext'] = determine_ext(format['url']) + + if self.params.get('listformats', None): + self.list_formats(info_dict) + return + + format_limit = self.params.get('format_limit', None) + if format_limit: + formats = list(takewhile_inclusive( + lambda f: f['format_id'] != format_limit, 
formats + )) + if self.params.get('prefer_free_formats'): + def _free_formats_key(f): + try: + ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext']) + except ValueError: + ext_ord = -1 + # We only compare the extension if they have the same height and width + return (f.get('height'), f.get('width'), ext_ord) + formats = sorted(formats, key=_free_formats_key) + + req_format = self.params.get('format', 'best') + if req_format is None: + req_format = 'best' + formats_to_download = [] + # The -1 is for supporting YoutubeIE + if req_format in ('-1', 'all'): + formats_to_download = formats + else: + # We can accept formats requestd in the format: 34/5/best, we pick + # the first that is available, starting from left + req_formats = req_format.split('/') + for rf in req_formats: + selected_format = self.select_format(rf, formats) + if selected_format is not None: + formats_to_download = [selected_format] + break + if not formats_to_download: + raise ExtractorError(u'requested format not available', + expected=True) + + if download: + if len(formats_to_download) > 1: + self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) + for format in formats_to_download: + new_info = dict(info_dict) + new_info.update(format) + self.process_info(new_info) + # We update the info dict with the best quality format (backwards compatibility) + info_dict.update(formats_to_download[-1]) + return info_dict + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -472,9 +585,9 @@ class YoutubeDL(object): if self.params.get('forceurl', False): # For RTMP URLs, also include the playpath compat_print(info_dict['url'] + info_dict.get('play_path', u'')) - if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: compat_print(info_dict['thumbnail']) - if self.params.get('forcedescription', False) and 'description' in 
info_dict: + if self.params.get('forcedescription', False) and info_dict.get('description') is not None: compat_print(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: compat_print(filename) @@ -508,10 +621,22 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return + if self.params.get('writeannotations', False): + try: + annofn = filename + u'.annotations.xml' + self.report_writeannotations(annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return + subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] @@ -533,7 +658,7 @@ class YoutubeDL(object): infofn = filename + u'.info.json' self.report_writeinfojson(infofn) try: - json_info_dict = dict((k, v) for k,v in info_dict.items() if not k in ['urlhandle']) + json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) write_json_file(json_info_dict, encodeFilename(infofn)) except (OSError, IOError): self.report_error(u'Cannot write metadata to JSON file ' + infofn) @@ -578,6 +703,8 @@ class YoutubeDL(object): self.report_error(u'postprocessing: %s' % str(err)) return + self.record_download_archive(info_dict) + def download(self, url_list): """Download a given list of URLs.""" if len(url_list) > 1 and self.fixed_template(): @@ -616,7 +743,7 @@ class 
YoutubeDL(object): self.to_screen('[download] Writing metadata to the file\'s xattrs') xattr_mapping = { - 'user.xdg.referrer.url': 'referrer', + 'user.xdg.referrer.url': 'webpage_url', # 'user.xdg.comment': 'description', 'user.dublincore.title': 'title', 'user.dublincore.date': 'upload_date', @@ -648,7 +775,7 @@ class YoutubeDL(object): keep_video = None for pp in self._pps: try: - keep_video_wish,new_info = pp.run(info) + keep_video_wish, new_info = pp.run(info) if keep_video_wish is not None: if keep_video_wish: keep_video = keep_video_wish @@ -663,3 +790,61 @@ class YoutubeDL(object): os.remove(encodeFilename(filename)) except (IOError, OSError): self.report_warning(u'Unable to remove downloaded video file') + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + if line.strip() == vid_id: + return True + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + u'\n') + + @staticmethod + def format_resolution(format, default='unknown'): + if format.get('_resolution') is not None: + return format['_resolution'] + if format.get('height') is not None: + if format.get('width') is not None: + res = u'%sx%s' % (format['width'], format['height']) + else: + res = u'%sp' % format['height'] + else: + res = default + return res + + def list_formats(self, info_dict): + def line(format): + return (u'%-15s%-10s%-12s%s' % ( + format['format_id'], + format['ext'], + self.format_resolution(format), + format.get('format_note', ''), + ) + ) + + formats = 
info_dict.get('formats', [info_dict]) + formats_s = list(map(line, formats)) + if len(formats) > 1: + formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)' + formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)' + + header_line = line({ + 'format_id': u'format code', 'ext': u'extension', + '_resolution': u'resolution', 'format_note': u'note'}) + self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % + (info_dict['id'], header_line, u"\n".join(formats_s))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3851fc0a6..48ffcbf8e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -31,11 +31,13 @@ __authors__ = ( 'Huarong Huo', 'Ismael Mejía', 'Steffan \'Ruirize\' James', + 'Andras Elso', ) __license__ = 'Public Domain' import codecs +import collections import getpass import optparse import os @@ -45,17 +47,43 @@ import shlex import socket import subprocess import sys -import warnings +import traceback import platform -from .utils import * +from .utils import ( + compat_cookiejar, + compat_print, + compat_str, + compat_urllib_request, + DateRange, + decodeOption, + determine_ext, + DownloadError, + get_cachedir, + make_HTTPS_handler, + MaxDownloadsReached, + platform_name, + preferredencoding, + SameFileError, + std_headers, + write_string, + YoutubeDLHandler, +) from .update import update_self from .version import __version__ -from .FileDownloader import * +from .FileDownloader import ( + FileDownloader, +) from .extractor import gen_extractors from .YoutubeDL import YoutubeDL -from .PostProcessor import * +from .PostProcessor import ( + FFmpegMetadataPP, + FFmpegVideoConvertor, + FFmpegExtractAudioPP, + FFmpegEmbedSubtitlePP, +) + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes): @@ -105,7 +133,7 @@ def parseOpts(overrideArguments=None): def _hide_login_info(opts): opts = list(opts) - for private_opt in ['-p', '--password', '-u', '--username']: + for private_opt in 
['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) opts[i+1] = '' @@ -151,6 +179,9 @@ def parseOpts(overrideArguments=None): action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)') general.add_option('-i', '--ignore-errors', action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False) + general.add_option('--abort-on-error', + action='store_false', dest='ignoreerrors', + help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) @@ -168,8 +199,8 @@ def parseOpts(overrideArguments=None): general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( - '--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', - help='Location in the filesystem where youtube-dl can store downloaded information permanently. %default by default') + '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', + help='Location in the filesystem where youtube-dl can store downloaded information permanently. 
By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl .') general.add_option( '--no-cache-dir', action='store_const', const=None, dest='cachedir', help='Disable filesystem caching') @@ -187,6 +218,13 @@ def parseOpts(overrideArguments=None): selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) + selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False) + selection.add_option('--age-limit', metavar='YEARS', dest='age_limit', + help='download only videos suitable for the given age', + default=None, type=int) + selection.add_option('--download-archive', metavar='FILE', + dest='download_archive', + help='Download only videos not present in the archive file. Record all downloaded videos in it.') authentication.add_option('-u', '--username', @@ -200,7 +238,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', - action='store', dest='format', metavar='FORMAT', + action='store', dest='format', metavar='FORMAT', default='best', help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') @@ -232,11 +270,11 @@ def parseOpts(overrideArguments=None): help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') downloader.add_option('-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 
50k or 44.6m)') + dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)') downloader.add_option('-R', '--retries', dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) downloader.add_option('--buffer-size', - dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") + dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024") downloader.add_option('--no-resize-buffer', action='store_true', dest='noresizebuffer', help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) @@ -278,6 +316,9 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--write-pages', + action='store_true', dest='write_pages', default=False, + help='Write downloaded pages to files in the current directory') verbosity.add_option('--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) @@ -297,7 +338,10 @@ def parseOpts(overrideArguments=None): help=('output filename template. 
Use %(title)s to get the title, ' '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' '%(autonumber)s to get an automatically incremented number, ' - '%(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), ' + '%(ext)s for the filename extension, ' + '%(format)s for the format description (like "22 - 1280x720" or "HD"),' + '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"),' + '%(upload_date)s for the upload date (YYYYMMDD), ' '%(extractor)s for the provider (youtube, metacafe, etc), ' '%(id)s for the video id , %(playlist)s for the playlist the video is in, ' '%(playlist_index)s for the position in the playlist and %% for a literal percent. ' @@ -331,6 +375,9 @@ def parseOpts(overrideArguments=None): filesystem.add_option('--write-info-json', action='store_true', dest='writeinfojson', help='write video metadata to a .info.json file', default=False) + filesystem.add_option('--write-annotations', + action='store_true', dest='writeannotations', + help='write video annotations to a .annotation file', default=False) filesystem.add_option('--write-thumbnail', action='store_true', dest='writethumbnail', help='write thumbnail image to disk', default=False) @@ -350,6 +397,8 @@ def parseOpts(overrideArguments=None): help='do not overwrite post-processed files; the post-processed files are overwritten by default') postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, help='embed subtitles in the video (only for mp4 videos)') + postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False, + help='add metadata to the files') parser.add_option_group(general) @@ -369,9 +418,13 @@ def parseOpts(overrideArguments=None): else: xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: - userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') + userConfFile = os.path.join(xdg_config_home, 
'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') else: - userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) commandLineConf = sys.argv[1:] @@ -436,27 +489,7 @@ def _real_main(argv=None): all_urls = batchurls + args all_urls = [url.strip() for url in all_urls] - # General configuration - cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy is not None: - if opts.proxy == '': - proxies = {} - else: - proxies = {'http': opts.proxy, 'https': opts.proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) - https_handler = make_HTTPS_handler(opts) - opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders =[] - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) + opener = _setup_opener(jar=jar, opts=opts) extractors = gen_extractors() @@ -473,6 +506,8 @@ def _real_main(argv=None): if not ie._WORKING: continue desc = getattr(ie, 'IE_DESC', ie.IE_NAME) + if desc is False: + continue if hasattr(ie, 'SEARCH_KEY'): _SEARCHES = (u'cute kittens', u'slithering 
pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise') _COUNTS = (u'', u'5', u'10', u'all') @@ -599,11 +634,13 @@ def _real_main(argv=None): 'progress_with_newline': opts.progress_with_newline, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, + 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, 'nopart': opts.nopart, 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'writesubtitles': opts.writesubtitles, @@ -618,6 +655,7 @@ def _real_main(argv=None): 'prefer_free_formats': opts.prefer_free_formats, 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, @@ -625,6 +663,8 @@ def _real_main(argv=None): 'daterange': date, 'cachedir': opts.cachedir, 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': opts.download_archive, }) if opts.verbose: @@ -644,11 +684,19 @@ def _real_main(argv=None): except: pass write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') - write_string(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') + + proxy_map = {} + for handler in opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') ydl.add_default_info_extractors() # PostProcessors + # Add the metadata pp first, the other pps will copy it + if opts.addmetadata: + ydl.add_post_processor(FFmpegMetadataPP()) if opts.extractaudio: ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) if opts.recodevideo: 
@@ -658,7 +706,7 @@ def _real_main(argv=None): # Update version if opts.update_self: - update_self(ydl.to_screen, opts.verbose, sys.argv[0]) + update_self(ydl.to_screen, opts.verbose) # Maybe do nothing if len(all_urls) < 1: @@ -677,11 +725,42 @@ def _real_main(argv=None): if opts.cookiefile is not None: try: jar.save() - except (IOError, OSError) as err: + except (IOError, OSError): sys.exit(u'ERROR: unable to save cookie jar') sys.exit(retcode) + +def _setup_opener(jar=None, opts=None, timeout=300): + if opts is None: + FakeOptions = collections.namedtuple( + 'FakeOptions', ['proxy', 'no_check_certificate']) + opts = FakeOptions(proxy=None, no_check_certificate=False) + + cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) + if opts.proxy is not None: + if opts.proxy == '': + proxies = {} + else: + proxies = {'http': opts.proxy, 'https': opts.proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = compat_urllib_request.ProxyHandler(proxies) + https_handler = make_HTTPS_handler(opts) + opener = compat_urllib_request.build_opener( + https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders = [] + compat_urllib_request.install_opener(opener) + socket.setdefaulttimeout(timeout) + return opener + + def main(argv=None): try: _real_main(argv) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d1b7e5f99..888a91cce 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,8 +2,14 @@ from .appletrailers import AppleTrailersIE from .addanime import AddAnimeIE from 
.archiveorg import ArchiveOrgIE from .ard import ARDIE -from .arte import ArteTvIE +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, + ArteTVFutureIE, +) from .auengine import AUEngineIE +from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE @@ -12,6 +18,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cinemassacre import CinemassacreIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -33,7 +40,9 @@ from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE +from .extremetube import ExtremeTubeIE from .facebook import FacebookIE +from .faz import FazIE from .fktv import ( FKTVIE, FKTVPosteckeIE, @@ -60,10 +69,12 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE +from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE @@ -72,41 +83,52 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE +from .mofosex import MofosexIE from .mtv import MTVIE from .muzu import MuzuTVIE +from .myspace import MySpaceIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE +from .nhl import NHLIE, NHLVideocenterIE +from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import 
PBSIE from .photobucket import PhotobucketIE +from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE +from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import SouthParkStudiosIE +from .spankwire import SpankwireIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE +from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE @@ -117,16 +139,22 @@ from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE from .vice import ViceIE +from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .vk import VKIE from .wat import WatIE +from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE +from .xtube import XTubeIE from .yahoo import YahooIE, YahooSearchIE from .youjizz import YouJizzIE from .youku import YoukuIE @@ -135,11 +163,13 @@ from .youtube import ( YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, + 
YoutubeSearchDateIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeRecommendedIE, + YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, ) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 82a785a19..b99d4b966 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -17,8 +17,8 @@ class AddAnimeIE(InfoExtractor): IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - u'file': u'24MR3YO5SAS9.flv', - u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'file': u'24MR3YO5SAS9.mp4', + u'md5': u'72954ea10bc979ab5e2eb288b21425a0', u'info_dict': { u"description": u"One Piece 606", u"title": u"One Piece 606" @@ -31,7 +31,8 @@ class AddAnimeIE(InfoExtractor): video_id = mobj.group('video_id') webpage = self._download_webpage(url, video_id) except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError): + if not isinstance(ee.cause, compat_HTTPError) or \ + ee.cause.code != 503: raise redir_webpage = ee.cause.read().decode('utf-8') @@ -60,16 +61,26 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r"var normal_video_file = '(.*?)';", - webpage, u'video file URL') + formats = [] + for format_id in ('normal', 'hq'): + rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) + video_url = self._search_regex(rex, webpage, u'video file URLx', + fatal=False) + if not video_url: + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + }) + if not formats: + raise ExtractorError(u'Cannot find any video format!') video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) return { '_type': 'video', 'id': video_id, - 'url': video_url, - 'ext': 'flv', + 'formats': formats, 'title': video_title, 'description': video_description } diff --git 
a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c196..6d6237f8a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,8 +1,10 @@ import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( + compat_urlparse, determine_ext, ) @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor): u"playlist": [ { u"file": u"manofsteel-trailer4.mov", - u"md5": u"11874af099d480cc09e103b189805d5f", + u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8", u"info_dict": { u"duration": 111, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", u"title": u"Trailer 4", u"upload_date": u"20130523", u"uploader_id": u"wb", @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer3.mov", - u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"md5": u"b8017b7131b721fb4e8d6f49e1df908c", u"info_dict": { u"duration": 182, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", u"title": u"Trailer 3", u"upload_date": u"20130417", u"uploader_id": u"wb", @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer.mov", - u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"md5": u"d0f1e1150989b9924679b441f3404d48", u"info_dict": { u"duration": 148, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", u"title": u"Trailer", u"upload_date": u"20121212", u"uploader_id": u"wb", @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-teaser.mov", - u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"md5": u"5fe08795b943eb2e757fa95cb6def1cb", u"info_dict": { u"duration": 93, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", u"title": u"Teaser", u"upload_date": u"20120721", u"uploader_id": u"wb", @@ -59,87 +57,61 @@ class 
AppleTrailersIE(InfoExtractor): ] } + _JSON_RE = r'iTunes.playURL\((.*?)\);' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) + playlist_cleaned = re.sub(r'(?s).*?', u'', playlist_snippet) + playlist_cleaned = re.sub(r'', r'', playlist_cleaned) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # with xml.etree.ElementTree.fromstring + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) playlist_html = u'' + playlist_cleaned + u'' - size_cache = {} - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): - title = li.find('.//h3').text + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, u'trailer info') + trailer_info = json.loads(trailer_info_json) + title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') - date_el = li.find('.//p') - upload_date = None - m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) - if m: - upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') - runtime_el = date_el.find('./br') - m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', runtime_el.tail) + runtime = trailer_info['runtime'] + m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) duration = None if m: duration = 60 * int(m.group('minutes')) + 
int(m.group('seconds')) + first_url = trailer_info['url'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') + settings = json.loads(settings_json) + formats = [] - for formats_el in li.findall('.//a'): - if formats_el.attrib['class'] != 'OverlayPanel': - continue - target = formats_el.attrib['target'] - - format_code = formats_el.text - if 'Automatic' in format_code: - continue - - size_q = formats_el.attrib['href'] - size_id = size_q.rpartition('#videos-')[2] - if size_id not in size_cache: - size_url = url + size_q - sizepage_html = self._download_webpage( - size_url, movie, - note=u'Downloading size info %s' % size_id, - errnote=u'Error while downloading size info %s' % size_id, - ) - _doc = xml.etree.ElementTree.fromstring(sizepage_html) - size_cache[size_id] = _doc - - sizepage_doc = size_cache[size_id] - links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') - for vid_a in links: - href = vid_a.get('href') - if not href.endswith(target): - continue - detail_q = href.partition('#')[0] - detail_url = url + '/' + detail_q - - m = re.match(r'includes/(?P[^/]+)/', detail_q) - detail_id = m.group('detail_id') - - detail_html = self._download_webpage( - detail_url, movie, - note=u'Downloading detail %s %s' % (detail_id, size_id), - errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) - ) - detail_doc = xml.etree.ElementTree.fromstring(detail_html) - movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') - assert movie_link_el.get('class') == 'movieLink' - movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') - ext = determine_ext(movie_link) - assert ext == 'mov' - - formats.append({ - 'format': format_code, - 'ext': ext, - 
'url': movie_link, - }) + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'ext': determine_ext(format_url), + 'format': format['type'], + 'width': format['width'], + 'height': int(format['height']), + }) + formats = sorted(formats, key=lambda f: (f['height'], f['width'])) info = { '_type': 'video', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 69b3b0ad7..e10c74c11 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import json import xml.etree.ElementTree @@ -7,15 +8,15 @@ from ..utils import ( ExtractorError, find_xpath_attr, unified_strdate, + determine_ext, + get_element_by_id, ) +# There are different sources of video in arte.tv, the extraction process +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. + class ArteTvIE(InfoExtractor): - """ - There are two sources of video in arte.tv: videos.arte.tv and - www.arte.tv/guide, the extraction process is different for each one. - The videos expire in 7 days, so we can't add tests. - """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' 
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' @@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor): @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) + return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor): # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): - mobj = re.match(self._EMISSION_URL, url) - if mobj is not None: - lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') - return self._extract_emission(url, video_id, lang) - mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') @@ -80,49 +73,6 @@ class ArteTvIE(InfoExtractor): # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id, lang): - """Extract from www.arte.tv/guide""" - webpage = self._download_webpage(url, video_id) - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') - - json_info = self._download_webpage(json_url, video_id, 'Downloading info json') - self.report_extraction(video_id) - info = json.loads(json_info) - player_info = info['videoJsonPlayer'] - - info_dict = {'id': player_info['VID'], - 'title': player_info['VTI'], - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), - 'thumbnail': player_info['programImage'], - 'ext': 'flv', - } - - formats = player_info['VSR'].values() - def _match_lang(f): - # Return true if that format is in the language of the url - if lang == 'fr': - l = 'F' - elif lang == 'de': - l = 'A' - regexes = 
[r'VO?%s' % l, r'VO?.-ST%s' % l] - return any(re.match(r, f['versionCode']) for r in regexes) - # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # We order the formats by quality - formats = sorted(formats, key=lambda f: int(f['height'])) - # Prefer videos without subtitles in the same language - formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) - # Pick the best quality - format_info = formats[-1] - if format_info['mediaType'] == u'rtmp': - info_dict['url'] = format_info['streamer'] - info_dict['play_path'] = 'mp4:' + format_info['url'] - else: - info_dict['url'] = format_info['url'] - - return info_dict - def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') @@ -172,3 +122,130 @@ class ArteTvIE(InfoExtractor): 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ArteTVPlus7IE(InfoExtractor): + IE_NAME = u'arte.tv:+7' + _VALID_URL = r'https?://www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' 
+ + @classmethod + def _extract_url_info(cls, url): + mobj = re.match(cls._VALID_URL, url) + lang = mobj.group('lang') + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') + return video_id, lang + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): + json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + + json_info = self._download_webpage(json_url, video_id, 'Downloading info json') + self.report_extraction(video_id) + info = json.loads(json_info) + player_info = info['videoJsonPlayer'] + + info_dict = { + 'id': player_info['VID'], + 'title': player_info['VTI'], + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + } + + all_formats = player_info['VSR'].values() + # Some formats use the m3u8 protocol + all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) + def _match_lang(f): + if f.get('versionCode') is None: + return True + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, all_formats) + formats = list(formats) # in python3 filter returns an iterator + if not formats: + # Some videos are only available in the 'Originalversion' + # they aren't tagged as being in French or German + if all(f['versionCode'] == 'VO' for f in all_formats): + formats = all_formats + else: + raise ExtractorError(u'The 
formats list is empty') + # We order the formats by quality + if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: + sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) + else: + sort_key = lambda f: int(f.get('height',-1)) + formats = sorted(formats, key=sort_key) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) + # Pick the best quality + def _format(format_info): + quality = format_info['quality'] + m_quality = re.match(r'\w*? - (\d*)p', quality) + if m_quality is not None: + quality = m_quality.group(1) + if format_info.get('versionCode') is not None: + format_id = u'%s-%s' % (quality, format_info['versionCode']) + else: + format_id = quality + info = { + 'format_id': format_id, + 'format_note': format_info.get('versionLibelle'), + 'width': format_info.get('width'), + 'height': format_info.get('height'), + } + if format_info['mediaType'] == u'rtmp': + info['url'] = format_info['streamer'] + info['play_path'] = 'mp4:' + format_info['url'] + info['ext'] = 'flv' + else: + info['url'] = format_info['url'] + info['ext'] = determine_ext(info['url']) + return info + info_dict['formats'] = [_format(f) for f in formats] + + return info_dict + + +# It also uses the arte_vp_url url from the webpage to extract the information +class ArteTVCreativeIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:creative' + _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de)/magazine?/(?P.+)' + + _TEST = { + u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + u'file': u'050489-002.mp4', + u'info_dict': { + u'title': u'Agentur Amateur / Agence Amateur #2 : Corporate Design', + }, + } + + +class ArteTVFutureIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:future' + _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(thema|sujet)/.*?#article-anchor-(?P\d+)' + + _TEST = { + u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', 
class BambuserIE(InfoExtractor):
    """Extractor for a single Bambuser broadcast (bambuser.com/v/<id>)."""
    IE_NAME = u'bambuser'
    # The 'id' named group is read back in _real_extract().
    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
    _API_KEY = '005f64509e19a868399060af746a00aa'

    _TEST = {
        u'url': u'http://bambuser.com/v/4050584',
        u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
        u'info_dict': {
            u'id': u'4050584',
            u'ext': u'flv',
            u'title': u'Education engineering days - lightning talks',
            u'duration': 3741,
            u'uploader': u'pixelversity',
            u'uploader_id': u'344706',
        },
    }

    def _real_extract(self, url):
        """Fetch the getVideo.json record for the broadcast in *url*.

        Returns an info dict built from the 'result' object of the JSON
        response. 'title', 'url', 'length', 'views_total', 'username' and
        'uid' are required keys (a KeyError propagates if one is missing);
        'preview' is optional and becomes the thumbnail.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
            '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
        info_json = self._download_webpage(info_url, video_id)
        info = json.loads(info_json)['result']

        return {
            'id': video_id,
            'title': info['title'],
            'url': info['url'],
            'thumbnail': info.get('preview'),
            'duration': int(info['length']),
            'view_count': int(info['views_total']),
            'uploader': info['username'],
            'uploader_id': info['uid'],
        }


class BambuserChannelIE(InfoExtractor):
    """Extractor that turns a Bambuser channel page into a playlist."""
    IE_NAME = u'bambuser:channel'
    # The 'user' named group is read back in _real_extract(). The host dot is
    # escaped and https is accepted, consistent with BambuserIE._VALID_URL.
    _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
    # The maximum number we can get with each request
    _STEP = 50

    def _real_extract(self, url):
        """Collect every broadcast of the channel named in *url*.

        The xhr-api endpoint is queried _STEP broadcasts at a time; the
        'vid' of the last entry of each page is sent as 'vid_older_than'
        for the next request, and paging stops at the first empty page.

        Returns a playlist dict titled with the user name whose entries
        are url_result() references to the individual broadcast pages.
        """
        mobj = re.match(self._VALID_URL, url)
        user = mobj.group('user')
        urls = []
        last_id = ''
        for i in itertools.count(1):
            req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
                '&sort=created&access_mode=0%2C1%2C2&limit={count}'
                '&method=broadcast&format=json&vid_older_than={last}'
                ).format(user=user, count=self._STEP, last=last_id)
            req = compat_urllib_request.Request(req_url)
            # Without setting this header, we wouldn't get any result
            req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
            info_json = self._download_webpage(req, user,
                u'Downloading page %d' % i)
            results = json.loads(info_json)['result']
            if not results:
                # An empty page means there are no older broadcasts left
                break
            last_id = results[-1]['vid']
            urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)

        return {
            '_type': 'playlist',
            'title': user,
            'entries': urls,
        }
b/youtube_dl/extractor/brightcove.py index 558b3d009..0d9b87a34 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,7 +23,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'md5': u'8eccab865181d29ec2958f32a6a754f5', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', @@ -49,6 +49,13 @@ class BrightcoveIE(InfoExtractor): Build a Brightcove url from a xml string containing {params} """ + + # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 + object_str = re.sub(r'(', + lambda m: m.group(1) + '/>', object_str) + # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 + object_str = object_str.replace(u'<--', u'