diff --git a/.gitignore b/.gitignore index ca4e8f353..24fdb3626 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,11 @@ youtube-dl.tar.gz .coverage cover/ updates_key.pem -*.egg-info \ No newline at end of file +*.egg-info +*.srt +*.sbv +*.vtt +*.flv +*.mp4 +*.part +test/testdata diff --git a/README.md b/README.md index 75068fe56..14d62b189 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ which means you can modify it, redistribute it or use it however you like. -U, --update update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors continue on download errors + -i, --ignore-errors continue on download errors, for example to to + skip unavailable videos in a playlist --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent --referer REF specify a custom referer, use if the video access @@ -29,6 +30,11 @@ which means you can modify it, redistribute it or use it however you like. --extractor-descriptions Output descriptions of all supported extractors --proxy URL Use the specified HTTP/HTTPS proxy --no-check-certificate Suppress HTTPS certificate validation. + --cache-dir None Location in the filesystem where youtube-dl can + store downloaded information permanently. By + default $XDG_CACHE_HOME/youtube-dl or ~/.cache + /youtube-dl . + --no-cache-dir Disable filesystem caching ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) @@ -45,6 +51,7 @@ which means you can modify it, redistribute it or use it however you like. --date DATE download only videos uploaded in this date --datebefore DATE download only videos uploaded before this date --dateafter DATE download only videos uploaded after this date + --no-playlist download only the currently playing video ## Download Options: -r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m) @@ -113,7 +120,8 @@ which means you can modify it, redistribute it or use it however you like. ## Video Format Options: -f, --format FORMAT video format code, specifiy the order of - preference using slashes: "-f 22/17/18" + preference using slashes: "-f 22/17/18". "-f mp4" + and "-f flv" are also supported --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested @@ -122,10 +130,8 @@ which means you can modify it, redistribute it or use it however you like. only) ## Subtitle Options: - --write-sub write subtitle file (currently youtube only) - --write-auto-sub write automatic subtitle file (currently youtube - only) - --only-sub [deprecated] alias of --skip-download + --write-sub write subtitle file + --write-auto-sub write automatic subtitle file (youtube only) --all-subs downloads all the available subtitles of the video --list-subs lists all available subtitles for the video diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py new file mode 100644 index 000000000..e0c3cc83e --- /dev/null +++ b/devscripts/buildserver.py @@ -0,0 +1,405 @@ +#!/usr/bin/python3 + +from http.server import HTTPServer, BaseHTTPRequestHandler +from socketserver import ThreadingMixIn +import argparse +import ctypes +import functools +import sys +import threading +import traceback +import os.path + + +class BuildHTTPServer(ThreadingMixIn, HTTPServer): + allow_reuse_address = True + + +advapi32 = ctypes.windll.advapi32 + +SC_MANAGER_ALL_ACCESS = 0xf003f +SC_MANAGER_CREATE_SERVICE = 0x02 +SERVICE_WIN32_OWN_PROCESS = 0x10 +SERVICE_AUTO_START = 0x2 +SERVICE_ERROR_NORMAL = 0x1 +DELETE = 0x00010000 +SERVICE_STATUS_START_PENDING = 0x00000002 +SERVICE_STATUS_RUNNING = 0x00000004 +SERVICE_ACCEPT_STOP = 0x1 + +SVCNAME = 'youtubedl_builder' + +LPTSTR = ctypes.c_wchar_p +START_CALLBACK = ctypes.WINFUNCTYPE(None, ctypes.c_int, ctypes.POINTER(LPTSTR)) + + +class SERVICE_TABLE_ENTRY(ctypes.Structure): + _fields_ = [ + ('lpServiceName', LPTSTR), + ('lpServiceProc', START_CALLBACK) + ] + + +HandlerEx = ctypes.WINFUNCTYPE( + ctypes.c_int, # return + ctypes.c_int, # dwControl + ctypes.c_int, # dwEventType + ctypes.c_void_p, # lpEventData, + ctypes.c_void_p, # lpContext, +) + + +def _ctypes_array(c_type, py_array): + ar = (c_type * len(py_array))() + ar[:] = py_array + return ar + + +def win_OpenSCManager(): + res = advapi32.OpenSCManagerW(None, None, SC_MANAGER_ALL_ACCESS) + if not res: + raise Exception('Opening service manager failed - ' + 'are you running this as administrator?') + return res + + +def win_install_service(service_name, cmdline): + manager = win_OpenSCManager() + try: + h = advapi32.CreateServiceW( + manager, service_name, None, + SC_MANAGER_CREATE_SERVICE, SERVICE_WIN32_OWN_PROCESS, + SERVICE_AUTO_START, SERVICE_ERROR_NORMAL, + cmdline, None, None, None, None, None) + if not h: + raise OSError('Service creation failed: %s' % ctypes.FormatError()) + + advapi32.CloseServiceHandle(h) + finally: + advapi32.CloseServiceHandle(manager) + + +def win_uninstall_service(service_name): + manager = win_OpenSCManager() + try: + h = advapi32.OpenServiceW(manager, service_name, DELETE) + if not h: + raise OSError('Could not find service %s: %s' % ( + service_name, ctypes.FormatError())) + + try: + if not advapi32.DeleteService(h): + raise OSError('Deletion failed: %s' % ctypes.FormatError()) + finally: + advapi32.CloseServiceHandle(h) + finally: + advapi32.CloseServiceHandle(manager) + + +def win_service_report_event(service_name, msg, is_error=True): + with open('C:/sshkeys/log', 'a', encoding='utf-8') as f: + f.write(msg + '\n') + + event_log = advapi32.RegisterEventSourceW(None, service_name) + if not event_log: + raise OSError('Could not report event: %s' % ctypes.FormatError()) + + try: + type_id = 0x0001 if is_error else 0x0004 + event_id = 0xc0000000 if is_error else 0x40000000 + lines = _ctypes_array(LPTSTR, [msg]) + + if not advapi32.ReportEventW( + event_log, type_id, 0, event_id, None, len(lines), 0, + lines, None): + raise OSError('Event reporting failed: %s' % ctypes.FormatError()) + finally: + advapi32.DeregisterEventSource(event_log) + + +def win_service_handler(stop_event, *args): + try: + raise ValueError('Handler called with args ' + repr(args)) + TODO + except Exception as e: + tb = traceback.format_exc() + msg = str(e) + '\n' + tb + win_service_report_event(service_name, msg, is_error=True) + raise + + +def win_service_set_status(handle, status_code): + svcStatus = SERVICE_STATUS() + svcStatus.dwServiceType = SERVICE_WIN32_OWN_PROCESS + svcStatus.dwCurrentState = status_code + svcStatus.dwControlsAccepted = SERVICE_ACCEPT_STOP + + svcStatus.dwServiceSpecificExitCode = 0 + + if not advapi32.SetServiceStatus(handle, ctypes.byref(svcStatus)): + raise OSError('SetServiceStatus failed: %r' % ctypes.FormatError()) + + +def win_service_main(service_name, real_main, argc, argv_raw): + try: + #args = [argv_raw[i].value for i in range(argc)] + stop_event = threading.Event() + handler = HandlerEx(functools.partial(stop_event, win_service_handler)) + h = advapi32.RegisterServiceCtrlHandlerExW(service_name, handler, None) + if not h: + raise OSError('Handler registration failed: %s' % + ctypes.FormatError()) + + TODO + except Exception as e: + tb = traceback.format_exc() + msg = str(e) + '\n' + tb + win_service_report_event(service_name, msg, is_error=True) + raise + + +def win_service_start(service_name, real_main): + try: + cb = START_CALLBACK( + functools.partial(win_service_main, service_name, real_main)) + dispatch_table = _ctypes_array(SERVICE_TABLE_ENTRY, [ + SERVICE_TABLE_ENTRY( + service_name, + cb + ), + SERVICE_TABLE_ENTRY(None, ctypes.cast(None, START_CALLBACK)) + ]) + + if not advapi32.StartServiceCtrlDispatcherW(dispatch_table): + raise OSError('ctypes start failed: %s' % ctypes.FormatError()) + except Exception as e: + tb = traceback.format_exc() + msg = str(e) + '\n' + tb + win_service_report_event(service_name, msg, is_error=True) + raise + + +def main(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--install', + action='store_const', dest='action', const='install', + help='Launch at Windows startup') + parser.add_argument('-u', '--uninstall', + action='store_const', dest='action', const='uninstall', + help='Remove Windows service') + parser.add_argument('-s', '--service', + action='store_const', dest='action', const='service', + help='Run as a Windows service') + parser.add_argument('-b', '--bind', metavar='', + action='store', default='localhost:8142', + help='Bind to host:port (default %default)') + options = parser.parse_args(args=args) + + if options.action == 'install': + fn = os.path.abspath(__file__).replace('v:', '\\\\vboxsrv\\vbox') + cmdline = '%s %s -s -b %s' % (sys.executable, fn, options.bind) + win_install_service(SVCNAME, cmdline) + return + + if options.action == 'uninstall': + win_uninstall_service(SVCNAME) + return + + if options.action == 'service': + win_service_start(SVCNAME, main) + return + + host, port_str = options.bind.split(':') + port = int(port_str) + + print('Listening on %s:%d' % (host, port)) + srv = BuildHTTPServer((host, port), BuildHTTPRequestHandler) + thr = threading.Thread(target=srv.serve_forever) + thr.start() + input('Press ENTER to shut down') + srv.shutdown() + thr.join() + + +def rmtree(path): + for name in os.listdir(path): + fname = os.path.join(path, name) + if os.path.isdir(fname): + rmtree(fname) + else: + os.chmod(fname, 0o666) + os.remove(fname) + os.rmdir(path) + +#============================================================================== + +class BuildError(Exception): + def __init__(self, output, code=500): + self.output = output + self.code = code + + def __str__(self): + return self.output + + +class HTTPError(BuildError): + pass + + +class PythonBuilder(object): + def __init__(self, **kwargs): + pythonVersion = kwargs.pop('python', '2.7') + try: + key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\Python\PythonCore\%s\InstallPath' % pythonVersion) + try: + self.pythonPath, _ = _winreg.QueryValueEx(key, '') + finally: + _winreg.CloseKey(key) + except Exception: + raise BuildError('No such Python version: %s' % pythonVersion) + + super(PythonBuilder, self).__init__(**kwargs) + + +class GITInfoBuilder(object): + def __init__(self, **kwargs): + try: + self.user, self.repoName = kwargs['path'][:2] + self.rev = kwargs.pop('rev') + except ValueError: + raise BuildError('Invalid path') + except KeyError as e: + raise BuildError('Missing mandatory parameter "%s"' % e.args[0]) + + path = os.path.join(os.environ['APPDATA'], 'Build archive', self.repoName, self.user) + if not os.path.exists(path): + os.makedirs(path) + self.basePath = tempfile.mkdtemp(dir=path) + self.buildPath = os.path.join(self.basePath, 'build') + + super(GITInfoBuilder, self).__init__(**kwargs) + + +class GITBuilder(GITInfoBuilder): + def build(self): + try: + subprocess.check_output(['git', 'clone', 'git://github.com/%s/%s.git' % (self.user, self.repoName), self.buildPath]) + subprocess.check_output(['git', 'checkout', self.rev], cwd=self.buildPath) + except subprocess.CalledProcessError as e: + raise BuildError(e.output) + + super(GITBuilder, self).build() + + +class YoutubeDLBuilder(object): + authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile'] + + def __init__(self, **kwargs): + if self.repoName != 'youtube-dl': + raise BuildError('Invalid repository "%s"' % self.repoName) + if self.user not in self.authorizedUsers: + raise HTTPError('Unauthorized user "%s"' % self.user, 401) + + super(YoutubeDLBuilder, self).__init__(**kwargs) + + def build(self): + try: + subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'], + cwd=self.buildPath) + except subprocess.CalledProcessError as e: + raise BuildError(e.output) + + super(YoutubeDLBuilder, self).build() + + +class DownloadBuilder(object): + def __init__(self, **kwargs): + self.handler = kwargs.pop('handler') + self.srcPath = os.path.join(self.buildPath, *tuple(kwargs['path'][2:])) + self.srcPath = os.path.abspath(os.path.normpath(self.srcPath)) + if not self.srcPath.startswith(self.buildPath): + raise HTTPError(self.srcPath, 401) + + super(DownloadBuilder, self).__init__(**kwargs) + + def build(self): + if not os.path.exists(self.srcPath): + raise HTTPError('No such file', 404) + if os.path.isdir(self.srcPath): + raise HTTPError('Is a directory: %s' % self.srcPath, 401) + + self.handler.send_response(200) + self.handler.send_header('Content-Type', 'application/octet-stream') + self.handler.send_header('Content-Disposition', 'attachment; filename=%s' % os.path.split(self.srcPath)[-1]) + self.handler.send_header('Content-Length', str(os.stat(self.srcPath).st_size)) + self.handler.end_headers() + + with open(self.srcPath, 'rb') as src: + shutil.copyfileobj(src, self.handler.wfile) + + super(DownloadBuilder, self).build() + + +class CleanupTempDir(object): + def build(self): + try: + rmtree(self.basePath) + except Exception as e: + print('WARNING deleting "%s": %s' % (self.basePath, e)) + + super(CleanupTempDir, self).build() + + +class Null(object): + def __init__(self, **kwargs): + pass + + def start(self): + pass + + def close(self): + pass + + def build(self): + pass + + +class Builder(PythonBuilder, GITBuilder, YoutubeDLBuilder, DownloadBuilder, CleanupTempDir, Null): + pass + + +class BuildHTTPRequestHandler(BaseHTTPRequestHandler): + actionDict = { 'build': Builder, 'download': Builder } # They're the same, no more caching. + + def do_GET(self): + path = urlparse.urlparse(self.path) + paramDict = dict([(key, value[0]) for key, value in urlparse.parse_qs(path.query).items()]) + action, _, path = path.path.strip('/').partition('/') + if path: + path = path.split('/') + if action in self.actionDict: + try: + builder = self.actionDict[action](path=path, handler=self, **paramDict) + builder.start() + try: + builder.build() + finally: + builder.close() + except BuildError as e: + self.send_response(e.code) + msg = unicode(e).encode('UTF-8') + self.send_header('Content-Type', 'text/plain; charset=UTF-8') + self.send_header('Content-Length', len(msg)) + self.end_headers() + self.wfile.write(msg) + except HTTPError as e: + self.send_response(e.code, str(e)) + else: + self.send_response(500, 'Unknown build method "%s"' % action) + else: + self.send_response(500, 'Malformed URL') + +#============================================================================== + +if __name__ == '__main__': + main() diff --git a/devscripts/gh-pages/add-version.py b/devscripts/gh-pages/add-version.py index 116420ef2..35865b2f3 100755 --- a/devscripts/gh-pages/add-version.py +++ b/devscripts/gh-pages/add-version.py @@ -3,7 +3,8 @@ import json import sys import hashlib -import urllib.request +import os.path + if len(sys.argv) <= 1: print('Specify the version number as parameter') @@ -23,10 +24,14 @@ filenames = { 'bin': 'youtube-dl', 'exe': 'youtube-dl.exe', 'tar': 'youtube-dl-%s.tar.gz' % version} +build_dir = os.path.join('..', '..', 'build', version) for key, filename in filenames.items(): - print('Downloading and checksumming %s...' % filename) url = 'https://yt-dl.org/downloads/%s/%s' % (version, filename) - data = urllib.request.urlopen(url).read() + fn = os.path.join(build_dir, filename) + with open(fn, 'rb') as f: + data = f.read() + if not data: + raise ValueError('File %s is empty!' % fn) sha256sum = hashlib.sha256(data).hexdigest() new_version[key] = (url, sha256sum) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py new file mode 100755 index 000000000..33f242480 --- /dev/null +++ b/devscripts/gh-pages/update-sites.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import sys +import os +import textwrap + +# We must be able to import youtube_dl +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +import youtube_dl + +def main(): + with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf: + template = tmplf.read() + + ie_htmls = [] + for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): + ie_html = '{}'.format(ie.IE_NAME) + try: + ie_html += ': {}'.format(ie.IE_DESC) + except AttributeError: + pass + if ie.working() == False: + ie_html += ' (Currently broken)' + ie_htmls.append('
  • {}
  • '.format(ie_html)) + + template = template.replace('@SITES@', textwrap.indent('\n'.join(ie_htmls), '\t')) + + with open('supportedsites.html', 'w', encoding='utf-8') as sitesf: + sitesf.write(template) + +if __name__ == '__main__': + main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 24c9ad8d8..796468b4b 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -55,8 +55,8 @@ git push origin "$version" /bin/echo -e "\n### OK, now it is time to build the binaries..." REV=$(git rev-parse HEAD) make youtube-dl youtube-dl.tar.gz -wget "http://jeromelaheurte.net:8142/download/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe || \ - wget "http://jeromelaheurte.net:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe +read -p "VM running? (y/n) " -n 1 +wget "http://localhost:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe mkdir -p "build/$version" mv youtube-dl youtube-dl.exe "build/$version" mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz" @@ -85,6 +85,7 @@ ROOT=$(pwd) "$ROOT/devscripts/gh-pages/sign-versions.py" < "$ROOT/updates_key.pem" "$ROOT/devscripts/gh-pages/generate-download.py" "$ROOT/devscripts/gh-pages/update-copyright.py" + "$ROOT/devscripts/gh-pages/update-sites.py" git add *.html *.html.in update git commit -m "release $version" git show HEAD diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py deleted file mode 100644 index 97a0d7290..000000000 --- a/devscripts/youtube_genalgo.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python - -# Generate youtube signature algorithm from test cases - -import sys - -tests = [ - # 92 - vflQw-fB4 2013/07/17 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", - "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), - # 90 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", - "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), - # 89 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'", - "/?;:|}<[{=+-_)(*&^%$#@!MqBVCXZASDFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuyt"), - # 88 - vflapUV9V 2013/08/28 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", - "ioplkjhgfdsazxcvbnm12<4567890QWERTYUIOZLKJHGFDSAeXCVBNM!@#$%^&*()_-+={[]}|:;?/>.3"), - # 87 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "uioplkjhgfdsazxcvbnm1t34567890QWE2TYUIOPLKJHGFDSAZXCVeNM!@#$^&*()_-+={[]}|:;?/>.<"), - # 86 - vflHOr_nV 2013/08/30 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "?;}|[{=+._)(*&^%$#@!MNBqCXZASDFGHJKLPOIUYTREWQ<987654321mnbvcxzasdfghjklpoiuytrew"), - # 85 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", - ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"), - # 84 - vflg0g8PQ 2013/08/29 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", - ">?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWq0987654321mnbvcxzasdfghjklpoiuytr"), - # 83 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"), - # 82 - vflZK4ZYR 2013/08/23 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", - "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"), - # 81 - vflLC8JvQ 2013/07/25 - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", - "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), - # 80 - vflZK4ZYR 2013/08/23 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>", - "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>"), - # 79 - vflLC8JvQ 2013/07/25 (sporadic) - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", - "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), -] - -tests_age_gate = [ - # 86 - vflqinMWD - ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), -] - -def find_matching(wrong, right): - idxs = [wrong.index(c) for c in right] - return compress(idxs) - return ('s[%d]' % i for i in idxs) - -def compress(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = ':%d' % (end+step) - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - -def _assert_compress(inp, exp): - res = list(compress(inp)) - if res != exp: - print('Got %r, expected %r' % (res, exp)) - assert res == exp -_assert_compress([0,2,4,6], ['s[0]', 's[2]', 's[4]', 's[6]']) -_assert_compress([0,1,2,4,6,7], ['s[:3]', 's[4]', 's[6:8]']) -_assert_compress([8,0,1,2,4,7,6,9], ['s[8]', 's[:3]', 's[4]', 's[7:5:-1]', 's[9]']) - -def gen(wrong, right, indent): - code = ' + '.join(find_matching(wrong, right)) - return 'if len(s) == %d:\n%s return %s\n' % (len(wrong), indent, code) - -def genall(tests): - indent = ' ' * 8 - return indent + (indent + 'el').join(gen(wrong, right, indent) for wrong,right in tests) - -def main(): - print(genall(tests)) - print(u' Age gate:') - print(genall(tests_age_gate)) - -if __name__ == '__main__': - main() diff --git a/test/parameters.json b/test/parameters.json index 96998b5c3..f042880ed 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -38,7 +38,6 @@ "writedescription": false, "writeinfojson": true, "writesubtitles": false, - "onlysubtitles": false, "allsubtitles": false, "listssubtitles": false } diff --git a/test/test_all_urls.py b/test/test_all_urls.py index c54faa380..ff1c86efe 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -11,24 +11,50 @@ from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, from helper import get_testcases class TestAllURLsMatching(unittest.TestCase): + def setUp(self): + self.ies = gen_extractors() + + def matching_ies(self, url): + return [ie.IE_NAME for ie in self.ies if ie.suitable(url) and ie.IE_NAME != 'generic'] + + def assertMatch(self, url, ie_list): + self.assertEqual(self.matching_ies(url), ie_list) + def test_youtube_playlist_matching(self): - self.assertTrue(YoutubePlaylistIE.suitable(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')) - self.assertTrue(YoutubePlaylistIE.suitable(u'UUBABnxM4Ar9ten8Mdjj1j0Q')) #585 - self.assertTrue(YoutubePlaylistIE.suitable(u'PL63F0C78739B09958')) - self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')) - self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')) - self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')) - self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668 - self.assertFalse(YoutubePlaylistIE.suitable(u'PLtS2H6bU1M')) + assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertPlaylist(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') + assertPlaylist(u'UUBABnxM4Ar9ten8Mdjj1j0Q') #585 + assertPlaylist(u'PL63F0C78739B09958') + assertPlaylist(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertPlaylist(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') + assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668 + self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M')) def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M')) self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668 + self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) + self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) + self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) def test_youtube_channel_matching(self): - self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM')) - self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')) - self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')) + assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) + assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') + assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') + assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') + + def test_youtube_user_matching(self): + self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + + def test_youtube_feeds(self): + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) + self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) + self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) + + def test_youtube_show_matching(self): + self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) def test_justin_tv_channelid_matching(self): self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) @@ -47,10 +73,13 @@ class TestAllURLsMatching(unittest.TestCase): self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) def test_youtube_extract(self): - self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') - self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') - self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc'), 'BaW_jenozKc') - self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch_popup?v=BaW_jenozKc'), 'BaW_jenozKc') + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE()._extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') def test_no_duplicates(self): ies = gen_extractors() @@ -63,15 +92,12 @@ class TestAllURLsMatching(unittest.TestCase): self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url)) def test_keywords(self): - ies = gen_extractors() - matching_ies = lambda url: [ie.IE_NAME for ie in ies - if ie.suitable(url) and ie.IE_NAME != 'generic'] - self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions']) - self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions']) - self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral']) - self.assertEqual(matching_ies(':tds'), ['ComedyCentral']) - self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral']) - self.assertEqual(matching_ies(':cr'), ['ComedyCentral']) + self.assertMatch(':ytsubs', ['youtube:subscriptions']) + self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) + self.assertMatch(':thedailyshow', ['ComedyCentral']) + self.assertMatch(':tds', ['ComedyCentral']) + self.assertMatch(':colbertreport', ['ComedyCentral']) + self.assertMatch(':cr', ['ComedyCentral']) if __name__ == '__main__': diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py new file mode 100644 index 000000000..83c65d57e --- /dev/null +++ b/test/test_dailymotion_subtitles.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import sys +import unittest +import json +import io +import hashlib + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionIE +from youtube_dl.utils import * +from helper import FakeYDL + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() + +class TestDailymotionSubtitles(unittest.TestCase): + def setUp(self): + self.DL = FakeYDL() + self.url = 'http://www.dailymotion.com/video/xczg00' + def getInfoDict(self): + IE = DailymotionIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] + def test_no_writesubtitles(self): + subtitles = self.getSubtitles() + self.assertEqual(subtitles, None) + def test_subtitles(self): + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + def test_subtitles_lang(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslangs'] = ['fr'] + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles.keys()), 5) + def test_list_subtitles(self): + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() + self.assertEqual(info_dict, None) + def test_automatic_captions(self): + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = ['en'] + subtitles = self.getSubtitles() + self.assertTrue(len(subtitles.keys()) == 0) + def test_nosubtitles(self): + self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + def test_multiple_langs(self): + self.DL.params['writesubtitles'] = True + langs = ['es', 'fr', 'de'] + self.DL.params['subtitleslangs'] = langs + subtitles = self.getSubtitles() + for lang in langs: + self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 65de3a55c..c33511333 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# encoding: utf-8 import sys import unittest @@ -8,7 +9,14 @@ import json import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.extractor import ( + DailymotionPlaylistIE, + DailymotionUserIE, + VimeoChannelIE, + UstreamChannelIE, + SoundcloudUserIE, + LivestreamIE, +) from youtube_dl.utils import * from helper import FakeYDL @@ -26,6 +34,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'SPORT') self.assertTrue(len(result['entries']) > 20) + def test_dailymotion_user(self): + dl = FakeYDL() + ie = DailymotionUserIE(dl) + result = ie.extract('http://www.dailymotion.com/user/generation-quoi/') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Génération Quoi') + self.assertTrue(len(result['entries']) >= 26) + def test_vimeo_channel(self): dl = FakeYDL() ie = VimeoChannelIE(dl) @@ -34,5 +50,29 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Vimeo Tributes') self.assertTrue(len(result['entries']) > 24) + def test_ustream_channel(self): + dl = FakeYDL() + ie = UstreamChannelIE(dl) + result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'5124905') + self.assertTrue(len(result['entries']) >= 11) + + def test_soundcloud_user(self): + dl = FakeYDL() + ie = SoundcloudUserIE(dl) + result = ie.extract('https://soundcloud.com/the-concept-band') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'9615865') + self.assertTrue(len(result['entries']) >= 12) + + def test_livestream_event(self): + dl = FakeYDL() + ie = LivestreamIE(dl) + result = ie.extract('http://new.livestream.com/tedx/cityenglish') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'TEDCity2.0 (English)') + self.assertTrue(len(result['entries']) >= 4) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index be1069105..ff2e9885b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,13 +11,16 @@ import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) #from youtube_dl.utils import htmlentity_transform -from youtube_dl.utils import timeconvert -from youtube_dl.utils import sanitize_filename -from youtube_dl.utils import unescapeHTML -from youtube_dl.utils import orderedSet -from youtube_dl.utils import DateRange -from youtube_dl.utils import unified_strdate -from youtube_dl.utils import find_xpath_attr +from youtube_dl.utils import ( + timeconvert, + sanitize_filename, + unescapeHTML, + orderedSet, + DateRange, + unified_strdate, + find_xpath_attr, + get_meta_content, +) if sys.version_info < (3, 0): _compat_str = lambda b: b.decode('unicode-escape') @@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + def test_meta_parser(self): + testhtml = u''' + + + + + ''' + get_meta = lambda name: get_meta_content(name, testhtml) + self.assertEqual(get_meta('description'), u'foo & bar') + self.assertEqual(get_meta('author'), 'Plato') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index dd9e292b0..53e65816d 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -27,6 +27,14 @@ class TestYoutubeLists(unittest.TestCase): ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE']) + def test_youtube_playlist_noplaylist(self): + dl = FakeYDL() + dl.params['noplaylist'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertEqual(result['_type'], 'url') + self.assertEqual(YoutubeIE()._extract_id(result['url']), 'FXxLjLQi3Fg') + def test_issue_673(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py new file mode 100644 index 000000000..5007d9a16 --- /dev/null +++ b/test/test_youtube_signature.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +import io +import re +import string +import sys +import unittest + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import YoutubeIE +from youtube_dl.utils import compat_str, compat_urlretrieve + +_TESTS = [ + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', + u'js', + 86, + u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', + ), + ( + u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', + u'js', + 85, + u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', + ), + ( + u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf', + u'swf', + 82, + u':/.-,+*)=\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBAzyxw>utsrqponmlkjihgfedcba987654321' + ), +] + + +class TestSignature(unittest.TestCase): + def setUp(self): + TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + if not os.path.exists(self.TESTDATA_DIR): + os.mkdir(self.TESTDATA_DIR) + + +def make_tfunc(url, stype, sig_length, expected_sig): + basename = url.rpartition('/')[2] + m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) + assert m, '%r should follow URL format' % basename + test_id = m.group(1) + + def test_func(self): + fn = os.path.join(self.TESTDATA_DIR, basename) + + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + + ie = YoutubeIE() + if stype == 'js': + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) + else: + assert stype == 'swf' + with open(fn, 'rb') as testf: + swfcode = testf.read() + func = ie._parse_sig_swf(swfcode) + src_sig = compat_str(string.printable[:sig_length]) + got_sig = func(src_sig) + self.assertEqual(got_sig, expected_sig) + + test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + setattr(TestSignature, test_func.__name__, test_func) + +for test_spec in _TESTS: + make_tfunc(*test_spec) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 641206277..168e6c66c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -18,85 +18,65 @@ md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): - DL = FakeYDL() - DL.params['allsubtitles'] = False - DL.params['writesubtitles'] = False - DL.params['subtitlesformat'] = 'srt' - DL.params['listsubtitles'] = False - def test_youtube_no_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = False - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL = FakeYDL() + self.url = 'QRS8MkLhQmM' + def getInfoDict(self): + IE = YoutubeIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] + def test_youtube_no_writesubtitles(self): + self.DL.params['writesubtitles'] = False + subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_youtube_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') - def test_youtube_subtitles_it(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitleslangs'] = ['it'] - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['it'] - self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_onlysubtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['onlysubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + def test_youtube_subtitles_lang(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslangs'] = ['it'] + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_allsubtitles(self): - DL = FakeYDL() - DL.params['allsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) def test_youtube_subtitles_sbv_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'sbv' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'sbv' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') def test_youtube_subtitles_vtt_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'vtt' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'vtt' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') def test_youtube_list_subtitles(self): - DL = FakeYDL() - DL.params['listsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_youtube_automatic_captions(self): - DL = FakeYDL() - DL.params['writeautomaticsub'] = True - DL.params['subtitleslangs'] = ['it'] - IE = YoutubeIE(DL) - info_dict = IE.extract('8YoUxe5ncPo') - sub = info_dict[0]['subtitles']['it'] - self.assertTrue(sub is not None) + self.url = '8YoUxe5ncPo' + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslangs'] = ['it'] + subtitles = self.getSubtitles() + self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): + self.url = 'sAjKT8FhjI8' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) def test_youtube_multiple_langs(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True + self.url = 'QRS8MkLhQmM' + self.DL.params['writesubtitles'] = True langs = ['it', 'fr', 'de'] - DL.params['subtitleslangs'] = langs - IE = YoutubeIE(DL) - subtitles = IE.extract('QRS8MkLhQmM')[0]['subtitles'] + self.DL.params['subtitleslangs'] = langs + subtitles = self.getSubtitles() for lang in langs: self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index c82b049d7..a79de3cd6 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -66,7 +66,7 @@ class FileDownloader(object): @staticmethod def format_seconds(seconds): (mins, secs) = divmod(seconds, 60) - (hours, eta_mins) = divmod(mins, 60) + (hours, mins) = divmod(mins, 60) if hours > 99: return '--:--:--' if hours == 0: @@ -77,26 +77,43 @@ class FileDownloader(object): @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: + return None + return float(byte_counter) / float(data_len) * 100.0 + + @staticmethod + def format_percent(percent): + if percent is None: return '---.-%' - return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) + return '%6s' % ('%3.1f%%' % percent) @staticmethod def calc_eta(start, now, total, current): if total is None: - return '--:--' + return None dif = now - start if current == 0 or dif < 0.001: # One millisecond - return '--:--' + return None rate = float(current) / dif - eta = int((float(total) - float(current)) / rate) + return int((float(total) - float(current)) / rate) + + @staticmethod + def format_eta(eta): + if eta is None: + return '--:--' return FileDownloader.format_seconds(eta) @staticmethod def calc_speed(start, now, bytes): dif = now - start if bytes == 0 or dif < 0.001: # One millisecond + return None + return float(bytes) / dif + + @staticmethod + def format_speed(speed): + if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) + return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -205,11 +222,14 @@ class FileDownloader(object): """Report destination filename.""" self.to_screen(u'[download] Destination: ' + filename) - def report_progress(self, percent_str, data_len_str, speed_str, eta_str): + def report_progress(self, percent, data_len_str, speed, eta): """Report download progress.""" if self.params.get('noprogress', False): return clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + eta_str = self.format_eta(eta) + percent_str = self.format_percent(percent) + speed_str = self.format_speed(speed) if self.params.get('progress_with_newline', False): self.to_screen(u'[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str)) @@ -435,6 +455,7 @@ class FileDownloader(object): self._hook_progress({ 'filename': filename, 'status': 'finished', + 'total_bytes': os.path.getsize(encodeFilename(filename)), }) return True @@ -583,13 +604,14 @@ class FileDownloader(object): block_size = self.best_block_size(after - before, len(data_block)) # Progress message - speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) + speed = self.calc_speed(start, time.time(), byte_counter - resume_len) if data_len is None: self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + eta = None else: - percent_str = self.calc_percent(byte_counter, data_len) - eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - self.report_progress(percent_str, data_len_str, speed_str, eta_str) + percent = self.calc_percent(byte_counter, data_len) + eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent, data_len_str, speed, eta) self._hook_progress({ 'downloaded_bytes': byte_counter, @@ -597,6 +619,8 @@ class FileDownloader(object): 'tmpfilename': tmpfilename, 'filename': filename, 'status': 'downloading', + 'eta': eta, + 'speed': speed, }) # Apply rate limit @@ -639,6 +663,8 @@ class FileDownloader(object): * downloaded_bytes: Bytes on disks * total_bytes: Total bytes, None if unknown * tmpfilename: The filename we're currently writing to + * eta: The estimated time in seconds, None if unknown + * speed: The download speed in bytes/second, None if unknown Hooks are guaranteed to be called at least once (with status "finished") if the download is successful. diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index ae56d2082..3ee1d3c58 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -444,8 +444,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): if information['ext'] != u'mp4': self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - sub_langs = [key for key in information['subtitles']] + if not information.get('subtitles'): + self._downloader.to_screen(u'[ffmpeg] There aren\'t any subtitles to embed') + return True, information + sub_langs = [key for key in information['subtitles']] filename = information['filepath'] input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b289bd9e2..2503fd09b 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -74,12 +74,16 @@ class YoutubeDL(object): writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file allsubtitles: Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + None to disable filesystem cache. + noplaylist: Download single video instead of a playlist if in doubt. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -103,6 +107,17 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + + if (sys.version_info >= (3,) and sys.platform != 'win32' and + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not params['restrictfilenames']): + # On Python 3, the Unicode filesystem API will throw errors (#1474) + self.report_warning( + u'Assuming --restrict-filenames isnce file system encoding ' + u'cannot encode all charactes. ' + u'Set the LC_ALL environment variable to fix this.') + params['restrictfilenames'] = True + self.params = params self.fd = FileDownloader(self, self.params) @@ -141,14 +156,10 @@ class YoutubeDL(object): def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - assert type(message) == type(u'') if not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] output = message + terminator - if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr - output = output.encode(preferredencoding(), 'ignore') - self._screen_file.write(output) - self._screen_file.flush() + write_string(output, self._screen_file) def to_stderr(self, message): """Print message to stderr.""" @@ -492,13 +503,14 @@ class YoutubeDL(object): self.report_writedescription(descfn) with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(info_dict['description']) + except (KeyError, TypeError): + self.report_warning(u'There\'s no description to write.') except (OSError, IOError): self.report_error(u'Cannot write description file ' + descfn) return subtitles_are_requested = any([self.params.get('writesubtitles', False), - self.params.get('writeautomaticsub'), - self.params.get('allsubtitles', False)]) + self.params.get('writeautomaticsub')]) if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE @@ -534,11 +546,15 @@ class YoutubeDL(object): thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format self.to_screen(u'[%s] %s: Downloading thumbnail ...' % (info_dict['extractor'], info_dict['id'])) - uf = compat_urllib_request.urlopen(info_dict['thumbnail']) - with open(thumb_filename, 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % - (info_dict['extractor'], info_dict['id'], thumb_filename)) + try: + uf = compat_urllib_request.urlopen(info_dict['thumbnail']) + with open(thumb_filename, 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % + (info_dict['extractor'], info_dict['id'], thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning(u'Unable to download thumbnail "%s": %s' % + (info_dict['thumbnail'], compat_str(err))) if not self.params.get('skip_download', False): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): @@ -546,11 +562,11 @@ class YoutubeDL(object): else: try: success = self.fd._do_download(filename, info_dict) - except (OSError, IOError) as err: - raise UnavailableVideoError(err) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_error(u'unable to download video data: %s' % str(err)) return + except (OSError, IOError) as err: + raise UnavailableVideoError(err) except (ContentTooShortError, ) as err: self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5da919989..cbb6c92c2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -28,6 +28,9 @@ __authors__ = ( 'Axel Noack', 'Albert Kim', 'Pierre Rudloff', + 'Huarong Huo', + 'Ismael Mejía', + 'Steffan \'Ruirize\' James', ) __license__ = 'Public Domain' @@ -147,7 +150,7 @@ def parseOpts(overrideArguments=None): general.add_option('-U', '--update', action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)') general.add_option('-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) + action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False) general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) @@ -164,6 +167,12 @@ def parseOpts(overrideArguments=None): help='Output descriptions of all supported extractors', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') + general.add_option( + '--cache-dir', dest='cachedir', default=get_cachedir(), + help='Location in the filesystem where youtube-dl can store downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl .') + general.add_option( + '--no-cache-dir', action='store_const', const=None, dest='cachedir', + help='Disable filesystem caching') selection.add_option('--playlist-start', @@ -178,6 +187,7 @@ def parseOpts(overrideArguments=None): selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) + selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False) authentication.add_option('-u', '--username', @@ -192,7 +202,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', - help='video format code, specifiy the order of preference using slashes: "-f 22/17/18"') + help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', @@ -204,13 +214,10 @@ def parseOpts(overrideArguments=None): subtitles.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', - help='write subtitle file (currently youtube only)', default=False) + help='write subtitle file', default=False) subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', - help='write automatic subtitle file (currently youtube only)', default=False) - subtitles.add_option('--only-sub', - action='store_true', dest='skip_download', - help='[deprecated] alias of --skip-download', default=False) + help='write automatic subtitle file (youtube only)', default=False) subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video', default=False) @@ -221,7 +228,7 @@ def parseOpts(overrideArguments=None): action='store', dest='subtitlesformat', metavar='FORMAT', help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang', - action='callback', dest='subtitleslang', metavar='LANGS', type='str', + action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_comma_separated_values_options_callback, help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') @@ -272,6 +279,10 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--youtube-print-sig-code', + action='store_true', dest='youtube_print_sig_code', default=False, + help=optparse.SUPPRESS_HELP) + filesystem.add_option('-t', '--title', action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -355,22 +366,26 @@ def parseOpts(overrideArguments=None): if overrideArguments is not None: opts, args = parser.parse_args(overrideArguments) if opts.verbose: - sys.stderr.write(u'[debug] Override config: ' + repr(overrideArguments) + '\n') + write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n') else: xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: - userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') + userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') else: - userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') systemConf = _readOptions('/etc/youtube-dl.conf') userConf = _readOptions(userConfFile) commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) if opts.verbose: - sys.stderr.write(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') - sys.stderr.write(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') - sys.stderr.write(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') + write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') + write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') + write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') return parser, opts, args @@ -397,7 +412,7 @@ def _real_main(argv=None): except (IOError, OSError) as err: if opts.verbose: traceback.print_exc() - sys.stderr.write(u'ERROR: unable to open cookie file\n') + write_string(u'ERROR: unable to open cookie file\n') sys.exit(101) # Set user agent if opts.user_agent is not None: @@ -424,7 +439,7 @@ def _real_main(argv=None): batchurls = [x.strip() for x in batchurls] batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] if opts.verbose: - sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') + write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args @@ -538,6 +553,11 @@ def _real_main(argv=None): else: date = DateRange(opts.dateafter, opts.datebefore) + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + # this was the old behaviour if only --all-sub was given. + if opts.allsubtitles and (opts.writeautomaticsub == False): + opts.writesubtitles = True + if sys.version_info < (3,): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: @@ -550,6 +570,10 @@ def _real_main(argv=None): or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') or u'%(title)s-%(id)s.%(ext)s') + if '%(ext)s' not in outtmpl and opts.extractaudio: + parser.error(u'Cannot download a video and extract audio into the same' + u' file! Use "%%(ext)s" instead of %r' % + determine_ext(outtmpl, u'')) # YoutubeDL ydl = YoutubeDL({ @@ -584,6 +608,7 @@ def _real_main(argv=None): 'progress_with_newline': opts.progress_with_newline, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, + 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, 'nopart': opts.nopart, @@ -596,7 +621,7 @@ def _real_main(argv=None): 'allsubtitles': opts.allsubtitles, 'listsubtitles': opts.listsubtitles, 'subtitlesformat': opts.subtitlesformat, - 'subtitleslangs': opts.subtitleslang, + 'subtitleslangs': opts.subtitleslangs, 'matchtitle': decodeOption(opts.matchtitle), 'rejecttitle': decodeOption(opts.rejecttitle), 'max_downloads': opts.max_downloads, @@ -608,10 +633,12 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, }) if opts.verbose: - sys.stderr.write(u'[debug] youtube-dl version ' + __version__ + u'\n') + write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -620,14 +647,14 @@ def _real_main(argv=None): out, err = sp.communicate() out = out.decode().strip() if re.match('[0-9a-f]+', out): - sys.stderr.write(u'[debug] Git HEAD: ' + out + u'\n') + write_string(u'[debug] Git HEAD: ' + out + u'\n') except: try: sys.exc_clear() except: pass - sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') - sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') + write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') + write_string(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') ydl.add_default_info_extractors() @@ -641,7 +668,7 @@ def _real_main(argv=None): # Update version if opts.update_self: - update_self(ydl.to_screen, opts.verbose, sys.argv[0]) + update_self(ydl.to_screen, opts.verbose) # Maybe do nothing if len(all_urls) < 1: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 90f1a4418..d1b7e5f99 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .arte import ArteTvIE from .auengine import AUEngineIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE +from .bloomberg import BloombergIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .c56 import C56IE @@ -17,16 +18,33 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE, DailymotionPlaylistIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .daum import DaumIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE +from .defense import DefenseGouvFrIE +from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE from .facebook import FacebookIE +from .fktv import ( + FKTVIE, + FKTVPosteckeIE, +) from .flickr import FlickrIE +from .francetv import ( + PluzzIE, + FranceTvInfoIE, + France2IE, + GenerationQuoiIE +) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE from .gamespot import GameSpotIE @@ -46,18 +64,22 @@ from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE +from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE +from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE +from .newgrounds import NewgroundsIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE @@ -71,8 +93,10 @@ from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE from .slashdot import SlashdotIE +from .slideshare import SlideshareIE from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE +from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .southparkstudios import SouthParkStudiosIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE @@ -87,10 +111,12 @@ from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE from .unistra import UnistraIE -from .ustream import UstreamIE +from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE +from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE +from .vice import ViceIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c196..6d6237f8a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,8 +1,10 @@ import re import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( + compat_urlparse, determine_ext, ) @@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor): u"playlist": [ { u"file": u"manofsteel-trailer4.mov", - u"md5": u"11874af099d480cc09e103b189805d5f", + u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8", u"info_dict": { u"duration": 111, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", u"title": u"Trailer 4", u"upload_date": u"20130523", u"uploader_id": u"wb", @@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer3.mov", - u"md5": u"07a0a262aae5afe68120eed61137ab34", + u"md5": u"b8017b7131b721fb4e8d6f49e1df908c", u"info_dict": { u"duration": 182, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", u"title": u"Trailer 3", u"upload_date": u"20130417", u"uploader_id": u"wb", @@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-trailer.mov", - u"md5": u"e401fde0813008e3307e54b6f384cff1", + u"md5": u"d0f1e1150989b9924679b441f3404d48", u"info_dict": { u"duration": 148, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", u"title": u"Trailer", u"upload_date": u"20121212", u"uploader_id": u"wb", @@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor): }, { u"file": u"manofsteel-teaser.mov", - u"md5": u"76b392f2ae9e7c98b22913c10a639c97", + u"md5": u"5fe08795b943eb2e757fa95cb6def1cb", u"info_dict": { u"duration": 93, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", u"title": u"Teaser", u"upload_date": u"20120721", u"uploader_id": u"wb", @@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor): ] } + _JSON_RE = r'iTunes.playURL\((.*?)\);' + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' + playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) + playlist_cleaned = re.sub(r'(?s).*?', u'', playlist_snippet) + playlist_cleaned = re.sub(r'', r'', playlist_cleaned) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # with xml.etree.ElementTree.fromstring + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) playlist_html = u'' + playlist_cleaned + u'' - size_cache = {} - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): - title = li.find('.//h3').text + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, u'trailer info') + trailer_info = json.loads(trailer_info_json) + title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') - date_el = li.find('.//p') - upload_date = None - m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) - if m: - upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') - runtime_el = date_el.find('./br') - m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', runtime_el.tail) + runtime = trailer_info['runtime'] + m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) duration = None if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + first_url = trailer_info['url'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') + settings = json.loads(settings_json) + formats = [] - for formats_el in li.findall('.//a'): - if formats_el.attrib['class'] != 'OverlayPanel': - continue - target = formats_el.attrib['target'] - - format_code = formats_el.text - if 'Automatic' in format_code: - continue - - size_q = formats_el.attrib['href'] - size_id = size_q.rpartition('#videos-')[2] - if size_id not in size_cache: - size_url = url + size_q - sizepage_html = self._download_webpage( - size_url, movie, - note=u'Downloading size info %s' % size_id, - errnote=u'Error while downloading size info %s' % size_id, - ) - _doc = xml.etree.ElementTree.fromstring(sizepage_html) - size_cache[size_id] = _doc - - sizepage_doc = size_cache[size_id] - links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') - for vid_a in links: - href = vid_a.get('href') - if not href.endswith(target): - continue - detail_q = href.partition('#')[0] - detail_url = url + '/' + detail_q - - m = re.match(r'includes/(?P[^/]+)/', detail_q) - detail_id = m.group('detail_id') - - detail_html = self._download_webpage( - detail_url, movie, - note=u'Downloading detail %s %s' % (detail_id, size_id), - errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) - ) - detail_doc = xml.etree.ElementTree.fromstring(detail_html) - movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') - assert movie_link_el.get('class') == 'movieLink' - movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') - ext = determine_ext(movie_link) - assert ext == 'mov' - - formats.append({ - 'format': format_code, - 'ext': ext, - 'url': movie_link, - }) + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'ext': determine_ext(format_url), + 'format': format['type'], + 'width': format['width'], + 'height': int(format['height']), + }) + formats = sorted(formats, key=lambda f: (f['height'], f['width'])) info = { '_type': 'video', diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 7efd1d823..61ce4469a 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor): for fn,fdata in data['files'].items() if 'Video' in fdata['format']] formats.sort(key=lambda fdata: fdata['file_size']) + for f in formats: + f['ext'] = determine_ext(f['url']) info = { '_type': 'video', @@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor): info['thumbnail'] = thumbnail # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = determine_ext(formats[-1]['url']) + info.update(formats[-1]) - return info \ No newline at end of file + return info diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py new file mode 100644 index 000000000..3666a780b --- /dev/null +++ b/youtube_dl/extractor/bloomberg.py @@ -0,0 +1,27 @@ +import re + +from .common import InfoExtractor + + +class BloombergIE(InfoExtractor): + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?).html' + + _TEST = { + u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', + u'info_dict': { + u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', + u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', + }, + u'params': { + # Requires ffmpeg (m3u8 manifest) + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + ooyala_url = self._og_search_video_url(webpage) + return self.url_result(ooyala_url, ie='Ooyala') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 71e3c7883..745212f2f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import re import json import xml.etree.ElementTree @@ -7,15 +9,39 @@ from ..utils import ( compat_urllib_parse, find_xpath_attr, compat_urlparse, + + ExtractorError, ) class BrightcoveIE(InfoExtractor): _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' - - # There is a test for Brigtcove in GenericIE, that way we test both the download - # and the detection of videos, and we don't have to find an URL that is always valid + + _TESTS = [ + { + # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', + u'file': u'2371591881001.mp4', + u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'note': u'Test Brightcove downloads and detection in GenericIE', + u'info_dict': { + u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + u'uploader': u'8TV', + u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', + } + }, + { + # From http://medianetwork.oracle.com/video/player/1785452137001 + u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', + u'file': u'1785452137001.flv', + u'info_dict': { + u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', + u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', + u'uploader': u'Oracle', + }, + }, + ] @classmethod def _build_brighcove_url(cls, object_str): @@ -23,6 +49,11 @@ class BrightcoveIE(InfoExtractor): Build a Brightcove url from a xml string containing {params} """ + + # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 + object_str = re.sub(r'(', + lambda m: m.group(1) + '/>', object_str) + object_doc = xml.etree.ElementTree.fromstring(object_str) assert u'BrightcoveExperience' in object_doc.attrib['class'] params = {'flashID': object_doc.attrib['id'], @@ -72,15 +103,27 @@ class BrightcoveIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): - renditions = video_info['renditions'] - renditions = sorted(renditions, key=lambda r: r['size']) - best_format = renditions[-1] + info = { + 'id': video_info['id'], + 'title': video_info['displayName'], + 'description': video_info.get('shortDescription'), + 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), + 'uploader': video_info.get('publisherName'), + } - return {'id': video_info['id'], - 'title': video_info['displayName'], - 'url': best_format['defaultURL'], + renditions = video_info.get('renditions') + if renditions: + renditions = sorted(renditions, key=lambda r: r['size']) + best_format = renditions[-1] + info.update({ + 'url': best_format['defaultURL'], 'ext': 'mp4', - 'description': video_info.get('shortDescription'), - 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), - 'uploader': video_info.get('publisherName'), - } + }) + elif video_info.get('FLVFullLengthURL') is not None: + info.update({ + 'url': video_info['FLVFullLengthURL'], + 'ext': 'flv', + }) + else: + raise ExtractorError(u'Unable to extract video url for %s' % info['id']) + return info diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 50832217a..e7f4fa9fd 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class Canalc2IE(InfoExtractor): - _IE_NAME = 'canalc2.tv' + IE_NAME = 'canalc2.tv' _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' _TEST = { diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1f02519a0..1db9b24cf 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import xml.etree.ElementTree @@ -5,24 +6,29 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P.*)|player\.canalplus\.fr/#/(?P\d+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' _TEST = { - u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', - u'file': u'889861.flv', - u'md5': u'590a888158b5f0d6832f84001fbf3e99', + u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + u'file': u'922470.flv', u'info_dict': { - u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', - u'upload_date': u'20130620', + u'title': u'Zapping - 26/08/13', + u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', + u'upload_date': u'20130826', + }, + u'params': { + u'skip_download': True, }, - u'skip': u'Requires rtmpdump' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + if video_id is None: + webpage = self._download_webpage(url, mobj.group('path')) + video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') info_url = self._VIDEO_INFO_TEMPLATE % video_id info_page = self._download_webpage(info_url,video_id, u'Downloading video info') @@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor): 'ext': 'flv', 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index bf8d711ee..69b2beece 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -51,12 +51,12 @@ class ComedyCentralIE(InfoExtractor): '400': 'mp4', } _video_dimensions = { - '3500': '1280x720', - '2200': '960x540', - '1700': '768x432', - '1200': '640x360', - '750': '512x288', - '400': '384x216', + '3500': (1280, 720), + '2200': (960, 540), + '1700': (768, 432), + '1200': (640, 360), + '750': (512, 288), + '400': (384, 216), } @classmethod @@ -64,11 +64,13 @@ class ComedyCentralIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???'))) - + @staticmethod + def _transform_rtmp_url(rtmp_video_url): + m = re.match(r'^rtmpe?://.*?/(?Pgsp.comedystor/.*)$', rtmp_video_url) + if not m: + raise ExtractorError(u'Cannot transform RTMP url') + base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' + return base + m.group('finalid') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -155,40 +157,31 @@ class ComedyCentralIE(InfoExtractor): self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') continue - if self._downloader.params.get('listformats', None): - self._print_formats([i[0] for i in turls]) - return - - # For now, just pick the highest bitrate - format,rtmp_video_url = turls[-1] - - # Get the format arg from the arg stream - req_format = self._downloader.params.get('format', None) - - # Select format if we can find one - for f,v in turls: - if f == req_format: - format, rtmp_video_url = f, v - break - - m = re.match(r'^rtmpe?://.*?/(?Pgsp.comedystor/.*)$', rtmp_video_url) - if not m: - raise ExtractorError(u'Cannot transform RTMP url') - base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' - video_url = base + m.group('finalid') + formats = [] + for format, rtmp_video_url in turls: + w, h = self._video_dimensions.get(format, (None, None)) + formats.append({ + 'url': self._transform_rtmp_url(rtmp_video_url), + 'ext': self._video_extensions.get(format, 'mp4'), + 'format_id': format, + 'height': h, + 'width': w, + }) effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) info = { 'id': shortMediaId, - 'url': video_url, + 'formats': formats, 'uploader': showId, 'upload_date': officialDate, 'title': effTitle, - 'ext': 'mp4', - 'format': format, 'thumbnail': None, 'description': compat_str(officialTitle), } + + # TODO: Remove when #980 has been merged + info.update(info['formats'][-1]) + results.append(info) return results diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77726ee24..69cdcdc1b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -35,6 +35,8 @@ class InfoExtractor(object): title: Video title, unescaped. ext: Video filename extension. + Instead of url and ext, formats can also specified. + The following fields are optional: format: The video format, defaults to ext (used for --get-format) @@ -52,8 +54,19 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen + formats: A list of dictionaries for each format available, it must + be ordered from worst to best quality. Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from width and height if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19") + * width Width of the video, if known + * height Height of the video, if known - The fields should all be Unicode strings. + Unless mentioned otherwise, the fields should be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1ea449ca8..7d8353946 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -3,28 +3,56 @@ import json import itertools from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor + from ..utils import ( compat_urllib_request, + compat_str, get_element_by_attribute, get_element_by_id, + orderedSet, ExtractorError, ) -class DailymotionIE(InfoExtractor): +class DailymotionBaseInfoExtractor(InfoExtractor): + @staticmethod + def _build_request(url): + """Build a request with the family filter disabled""" + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'family_filter=off') + return request + +class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' - _TEST = { - u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', - u'file': u'x33vw9.mp4', - u'md5': u'392c4b85a60a90dc4792da41ce3144eb', - u'info_dict': { - u"uploader": u"Amphora Alex and Van .", - u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" - } - } + _TESTS = [ + { + u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', + u'file': u'x33vw9.mp4', + u'md5': u'392c4b85a60a90dc4792da41ce3144eb', + u'info_dict': { + u"uploader": u"Amphora Alex and Van .", + u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } + }, + # Vevo video + { + u'url': u'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + u'file': u'USUV71301934.mp4', + u'info_dict': { + u'title': u'Roar (Official)', + u'uploader': u'Katy Perry', + u'upload_date': u'20130905', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'VEVO is only available in some countries', + }, + ] def _real_extract(self, url): # Extract id and simplified title from URL @@ -33,15 +61,24 @@ class DailymotionIE(InfoExtractor): video_id = mobj.group(1).split('_')[0].split('?')[0] video_extension = 'mp4' + url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') + request = self._build_request(url) webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) + # It may just embed a vevo video: + m_vevo = re.search( + r'[\w]*)', + webpage) + if m_vevo is not None: + vevo_id = m_vevo.group('id') + self.to_screen(u'Vevo video detected: %s' % vevo_id) + return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?) 0): + sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + return sub_lang_list + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} -class DailymotionPlaylistIE(InfoExtractor): + +class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): + IE_NAME = u'dailymotion:playlist' _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' _MORE_PAGES_INDICATOR = r'' + _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + def _extract_entries(self, id): video_ids = [] - for pagenum in itertools.count(1): - webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), - playlist_id, u'Downloading page %s' % pagenum) + request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) + webpage = self._download_webpage(request, + id, u'Downloading page %s' % pagenum) playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) - video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break + return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in orderedSet(video_ids)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) - entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') - for video_id in video_ids] return {'_type': 'playlist', 'id': playlist_id, 'title': get_element_by_id(u'playlist_name', webpage), - 'entries': entries, + 'entries': self._extract_entries(playlist_id), } + + +class DailymotionUserIE(DailymotionPlaylistIE): + IE_NAME = u'dailymotion:user' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P[^/]+)' + _MORE_PAGES_INDICATOR = r'' + _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + webpage = self._download_webpage(url, user) + full_user = self._html_search_regex( + r'(.*?)\d+)' + IE_NAME = u'daum.net' + + _TEST = { + u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', + u'file': u'52554690.mp4', + u'info_dict': { + u'title': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'description': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'upload_date': u'20130831', + u'duration': 3868, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + canonical_url = 'http://tvpot.daum.net/v/%s' % video_id + webpage = self._download_webpage(canonical_url, video_id) + full_id = self._search_regex(r'\d+)' + + _TEST = { + u'url': u'http://www.ebaumsworld.com/video/watch/83367677/', + u'file': u'83367677.mp4', + u'info_dict': { + u'title': u'A Giant Python Opens The Door', + u'description': u'This is how nightmares start...', + u'uploader': u'jihadpizza', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + config_xml = self._download_webpage( + 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + video_url = config.find('file').text + + return { + 'id': video_id, + 'title': config.find('title').text, + 'url': video_url, + 'ext': determine_ext(video_url), + 'description': config.find('description').text, + 'thumbnail': config.find('image').text, + 'uploader': config.find('username').text, + } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index beaa5b4bd..9d1bc0751 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -106,8 +106,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._html_search_regex('

    ([^<]+)

    ', - webpage, u'title') + video_title = self._html_search_regex( + r'

    ([^<]*)

    ', webpage, u'title') info = { 'id': video_id, diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py new file mode 100644 index 000000000..9c89362ef --- /dev/null +++ b/youtube_dl/extractor/fktv.py @@ -0,0 +1,79 @@ +import re +import random +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + get_element_by_id, + clean_html, +) + + +class FKTVIE(InfoExtractor): + IE_NAME = u'fernsehkritik.tv' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P[0-9]+)(?:/.*)?' + + _TEST = { + u'url': u'http://fernsehkritik.tv/folge-1', + u'file': u'00011.flv', + u'info_dict': { + u'title': u'Folge 1 vom 10. April 2007', + u'description': u'md5:fb4818139c7cfe6907d4b83412a6864f', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2, 4) + video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode + start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, + episode) + playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, + u'playlist', flags=re.DOTALL) + files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) + # TODO: return a single multipart video + videos = [] + for i, _ in enumerate(files, 1): + video_id = '%04d%d' % (episode, i) + video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) + video_title = 'Fernsehkritik %d.%d' % (episode, i) + videos.append({ + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': clean_html(get_element_by_id('eptitle', start_webpage)), + 'description': clean_html(get_element_by_id('contentlist', start_webpage)), + 'thumbnail': video_thumbnail + }) + return videos + + +class FKTVPosteckeIE(InfoExtractor): + IE_NAME = u'fernsehkritik.tv:postecke' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P[0-9]+)(&|$)' + _TEST = { + u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', + u'file': u'0120.flv', + u'md5': u'262f0adbac80317412f7e57b4808e5c4', + u'info_dict': { + u"title": u"Postecke 120" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = int(mobj.group('ep')) + + server = random.randint(2, 4) + video_id = '%04d' % episode + video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode) + video_title = 'Postecke %d' % episode + return { + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url), + 'title': video_title, + } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 80d96baf7..e1d2f0526 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -9,7 +9,7 @@ from ..utils import ( class FlickrIE(InfoExtractor): """Information Extractor for Flickr videos""" - _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P[\w\-_@]+)/(?P\d+).*' + _VALID_URL = r'(?:https?://)?(?:www\.|secure\.)?flickr\.com/photos/(?P[\w\-_@]+)/(?P\d+).*' _TEST = { u'url': u'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', u'file': u'5645318632.mp4', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py new file mode 100644 index 000000000..086cafca0 --- /dev/null +++ b/youtube_dl/extractor/francetv.py @@ -0,0 +1,129 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, +) + + +class FranceTVBaseInfoExtractor(InfoExtractor): + def _extract_video(self, video_id): + xml_desc = self._download_webpage( + 'http://www.francetvinfo.fr/appftv/webservices/video/' + 'getInfosOeuvre.php?id-diffusion=' + + video_id, video_id, 'Downloading XML config') + info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8')) + + manifest_url = info.find('videos/video/url').text + video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8') + video_url = video_url.replace('/z/', '/i/') + thumbnail_path = info.find('image').text + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': info.find('titre').text, + 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), + 'description': info.find('synopsis').text, + } + + +class PluzzIE(FranceTVBaseInfoExtractor): + IE_NAME = u'pluzz.francetv.fr' + _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html' + + # Can't use tests, videos expire in 7 days + + def _real_extract(self, url): + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + video_id = self._search_regex( + r'data-diffusion="(\d+)"', webpage, 'ID') + return self._extract_video(video_id) + + +class FranceTvInfoIE(FranceTVBaseInfoExtractor): + IE_NAME = u'francetvinfo.fr' + _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P.+).html' + + _TEST = { + u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + u'file': u'84981923.mp4', + u'info_dict': { + u'title': u'Soir 3', + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + video_id = self._search_regex(r'id-video=(\d+?)"', webpage, u'video id') + return self._extract_video(video_id) + + +class France2IE(FranceTVBaseInfoExtractor): + IE_NAME = u'france2.fr' + _VALID_URL = r'''(?x)https?://www\.france2\.fr/ + (?: + emissions/.*?/videos/(?P<id>\d+) + | emission/(?P<key>[^/?]+) + )''' + + _TEST = { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj.group('key'): + webpage = self._download_webpage(url, mobj.group('key')) + video_id = self._html_search_regex( + r'''(?x)<div\s+class="video-player">\s* + <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+ + class="francetv-video-player">''', + webpage, u'video ID') + else: + video_id = mobj.group('id') + return self._extract_video(video_id) + + +class GenerationQuoiIE(InfoExtractor): + IE_NAME = u'france2.fr:generation-quoi' + _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<name>.*)(\?|$)' + + _TEST = { + u'url': u'http://generation-quoi.france2.fr/portrait/garde-a-vous', + u'file': u'k7FJX8VBcvvLmX4wA5Q.mp4', + u'info_dict': { + u'title': u'Génération Quoi - Garde à Vous', + u'uploader': u'Génération Quoi', + }, + u'params': { + # It uses Dailymotion + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + info_url = compat_urlparse.urljoin(url, '/medias/video/%s.json' % name) + info_json = self._download_webpage(info_url, name) + info = json.loads(info_json) + return self.url_result('http://www.dailymotion.com/video/%s' % info['id'], + ie='Dailymotion') diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4508f0dfa..2ccdb7073 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -21,7 +21,8 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"', + video_url = self._search_regex( + [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''], webpage, u'video URL', flags=re.DOTALL) info = { diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 7585b7061..cd3bbe65f 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor): u"file": u"6410818.mp4", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { - u"title": u"Arma III - Community Guide: SITREP I", + u"title": u"Arma 3 - Community Guide: SITREP I", u"upload_date": u"20130627", } } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dc4dea4ad..764070635 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,17 +29,6 @@ class GenericIE(InfoExtractor): u"title": u"R\u00e9gis plante sa Jeep" } }, - { - u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', - u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', - u'note': u'Test Brightcove downloads and detection in GenericIE', - u'info_dict': { - u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', - u'uploader': u'8TV', - u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', - } - }, ] def report_download_webpage(self, video_id): @@ -109,6 +98,11 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): + parsed_url = compat_urlparse.urlparse(url) + if not parsed_url.scheme: + self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + return self.url_result('http://' + url) + try: new_url = self._test_redirect(url) if new_url: @@ -153,7 +147,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) if mobj is None: # HTML5 video - mobj = re.search(r'<video[^<]*>.*?<source .*?src="([^"]+)"', webpage, flags=re.DOTALL) + mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) @@ -162,9 +156,9 @@ class GenericIE(InfoExtractor): if mobj.group(1) is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_url = compat_urllib_parse.unquote(mobj.group(1)) + video_url = mobj.group(1) video_url = compat_urlparse.urljoin(url, video_url) - video_id = os.path.basename(video_url) + video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) # here's a fun little line of code for you: video_extension = os.path.splitext(video_id)[1][1:] diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index f1cd88983..8895ad289 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex( + ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'], webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ccca1d7e0..3798118a7 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -7,11 +7,11 @@ from .common import InfoExtractor class HotNewHipHopIE(InfoExtractor): _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P<id>.*)\.html' _TEST = { - u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html'", + u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html", u'file': u'1435540.mp3', u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', u'info_dict': { - u"title": u"Freddie Gibbs Songs - Lay It Down" + u"title": u"Freddie Gibbs - Lay It Down" } } diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 6104c4b5e..46954337f 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -19,8 +19,7 @@ class HowcastIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage_url = 'http://www.howcast.com/videos/' + video_id - webpage = self._download_webpage(webpage_url, video_id) + webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index b1c84278a..c52146f7d 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -13,7 +13,7 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles)(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' IE_NAME = u'ign.com' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' @@ -21,15 +21,39 @@ class IGNIE(InfoExtractor): r'id="my_show_video">.*?<p>(.*?)</p>', ] - _TEST = { - u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - u'file': u'8f862beef863986b2785559b9e1aa599.mp4', - u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', - u'info_dict': { - u'title': u'The Last of Us Review', - u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', - } - } + _TESTS = [ + { + u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + u'file': u'8f862beef863986b2785559b9e1aa599.mp4', + u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', + u'info_dict': { + u'title': u'The Last of Us Review', + u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', + } + }, + { + u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', + u'playlist': [ + { + u'file': u'5ebbd138523268b93c9141af17bec937.mp4', + u'info_dict': { + u'title': u'GTA 5 Video Review', + u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + }, + }, + { + u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', + u'info_dict': { + u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', + }, + }, + ], + u'params': { + u'skip_download': True, + }, + }, + ] def _find_video_id(self, webpage): res_id = [r'data-video-id="(.+?)"', @@ -46,6 +70,13 @@ class IGNIE(InfoExtractor): if page_type == 'articles': video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') return self.url_result(video_url, ie='IGN') + elif page_type != 'video': + multiple_urls = re.findall( + '<param name="flashvars" value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', + webpage) + if multiple_urls: + return [self.url_result(u, ie='IGN') for u in multiple_urls] + video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) description = self._html_search_regex(self._DESCRIPTION_RE, @@ -87,6 +118,9 @@ class OneUPIE(IGNIE): } } + # Override IGN tests + _TESTS = [] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) id = mobj.group('name_or_id') diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 4327bc13d..ae2e37a70 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -12,7 +12,7 @@ class JeuxVideoIE(InfoExtractor): _TEST = { u'url': u'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', u'file': u'5182.mp4', - u'md5': u'e0fdb0cd3ce98713ef9c1e1e025779d0', + u'md5': u'046e491afb32a8aaac1f44dd4ddd54ee', u'info_dict': { u'title': u'GC 2013 : Tearaway nous présente ses papiers d\'identité', u'description': u'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n', diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py new file mode 100644 index 000000000..50bc883ef --- /dev/null +++ b/youtube_dl/extractor/kickstarter.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor + + +class KickStarterIE(InfoExtractor): + _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*' + _TEST = { + u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location", + u"file": u"1404461844.mp4", + u"md5": u"c81addca81327ffa66c642b5d8b08cab", + u"info_dict": { + u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling", + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + webpage_src = self._download_webpage(url, video_id) + + video_url = self._search_regex(r'data-video="(.*?)">', + webpage_src, u'video URL') + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + video_title = self._html_search_regex(r"<title>(.*?)", + webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip() + + results = [{ + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': ext, + }] + return results diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 309921078..d04da98c8 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -2,7 +2,12 @@ import re import json from .common import InfoExtractor -from ..utils import compat_urllib_parse_urlparse, compat_urlparse +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urlparse, + get_meta_content, + ExtractorError, +) class LivestreamIE(InfoExtractor): @@ -35,8 +40,11 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', - webpage, 'api url') + player = get_meta_content('twitter:player', webpage) + if player is None: + raise ExtractorError('Couldn\'t extract event api url') + api_url = player.replace('/player', '') + api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) info = json.loads(self._download_webpage(api_url, event_name, u'Downloading event info')) videos = [self._extract_video_info(video_data['data']) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e38dc98b4..e537648ff 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -122,7 +122,7 @@ class MetacafeIE(InfoExtractor): video_title = self._html_search_regex(r'(?im)(.*) - Video', webpage, u'title') description = self._og_search_description(webpage) video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);', + r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) return { diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py new file mode 100644 index 000000000..449138b56 --- /dev/null +++ b/youtube_dl/extractor/metacritic.py @@ -0,0 +1,55 @@ +import re +import xml.etree.ElementTree +import operator + +from .common import InfoExtractor + + +class MetacriticIE(InfoExtractor): + _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P\d+)' + + _TEST = { + u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', + u'file': u'3698222.mp4', + u'info_dict': { + u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', + u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', + u'duration': 221, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + # The xml is not well formatted, there are raw '&' + info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml').replace('&', '&') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) + formats = [] + for videoFile in clip.findall('httpURI/videoFile'): + rate_str = videoFile.find('rate').text + video_url = videoFile.find('filePath').text + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': rate_str, + 'rate': int(rate_str), + }) + formats.sort(key=operator.itemgetter('rate')) + + description = self._html_search_regex(r'Description:(.*?)

    ', + webpage, u'description', flags=re.DOTALL) + + info = { + 'id': video_id, + 'title': clip.find('title').text, + 'formats': formats, + 'description': description, + 'duration': int(clip.find('duration').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 8245b5583..a200dcd74 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -5,34 +5,27 @@ import socket from .common import InfoExtractor from ..utils import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_request, - - ExtractorError, + unified_strdate, ) class MixcloudIE(InfoExtractor): - _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'mixcloud' - def report_download_json(self, file_id): - """Report JSON download.""" - self.to_screen(u'Downloading json') - - def get_urls(self, jsonData, fmt, bitrate='best'): - """Get urls from 'audio_formats' section in json""" - try: - bitrate_list = jsonData[fmt] - if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: - bitrate = max(bitrate_list) # select highest - - url_list = jsonData[fmt][bitrate] - except TypeError: # we have no bitrate info. - url_list = jsonData[fmt] - return url_list + _TEST = { + u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', + u'file': u'dholbach-cryptkeeper.mp3', + u'info_dict': { + u'title': u'Cryptkeeper', + u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', + u'uploader': u'Daniel Holbach', + u'uploader_id': u'dholbach', + u'upload_date': u'20111115', + }, + } def check_urls(self, url_list): """Returns 1st active url from list""" @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor): return None - def _print_formats(self, formats): - print('Available formats:') - for fmt in formats.keys(): - for b in formats[fmt]: - try: - ext = formats[fmt][b][0] - print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) - except TypeError: # we have no bitrate info - ext = formats[fmt][0] - print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - # extract uploader & filename from url - uploader = mobj.group(1).decode('utf-8') - file_id = uploader + "-" + mobj.group(2).decode('utf-8') - # construct API request - file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' - # retrieve .json file with links to files - request = compat_urllib_request.Request(file_url) - try: - self.report_download_json(file_url) - jsonData = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) + uploader = mobj.group(1) + cloudcast_name = mobj.group(2) + track_id = '-'.join((uploader, cloudcast_name)) + api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) + webpage = self._download_webpage(url, track_id) + json_data = self._download_webpage(api_url, track_id, + u'Downloading cloudcast info') + info = json.loads(json_data) - # parse JSON - json_data = json.loads(jsonData) - player_url = json_data['player_swf_url'] - formats = dict(json_data['audio_formats']) + preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') + song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') + template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) + final_song_url = self.check_urls(template_url % i for i in range(30)) - req_format = self._downloader.params.get('format', None) - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - if req_format is None or req_format == 'best': - for format_param in formats.keys(): - url_list = self.get_urls(formats, format_param) - # check urls - file_url = self.check_urls(url_list) - if file_url is not None: - break # got it! - else: - if req_format not in formats: - raise ExtractorError(u'Format is not available') - - url_list = self.get_urls(formats, req_format) - file_url = self.check_urls(url_list) - format_param = req_format - - return [{ - 'id': file_id.decode('utf-8'), - 'url': file_url.decode('utf-8'), - 'uploader': uploader.decode('utf-8'), - 'upload_date': None, - 'title': json_data['name'], - 'ext': file_url.split('.')[-1].decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': json_data['thumbnail_url'], - 'description': json_data['description'], - 'player_url': player_url.decode('utf-8'), - }] + return { + 'id': track_id, + 'title': info['name'], + 'url': final_song_url, + 'ext': 'mp3', + 'description': info['description'], + 'thumbnail': info['pictures'].get('extra_large'), + 'uploader': info['user']['name'], + 'uploader_id': info['user']['username'], + 'upload_date': unified_strdate(info['created_time']), + 'view_count': info['play_count'], + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8f956571d..001a576a8 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -54,23 +54,26 @@ class MTVIE(InfoExtractor): def _get_thumbnail_url(self, uri, itemdoc): return 'http://mtv.mtvnimages.com/uri/' + uri - def _extract_video_url(self, metadataXml): + def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: raise ExtractorError(u'This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) renditions = mdoc.findall('.//rendition') - # For now, always pick the highest quality. - rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - rtmp_video_url = rendition.find('./src').text - except KeyError: - raise ExtractorError('Invalid rendition field.') - video_url = self._transform_rtmp_url(rtmp_video_url) - return {'ext': ext, 'url': video_url, 'format': format} + formats = [] + for rendition in mdoc.findall('.//rendition'): + try: + _, _, ext = rendition.attrib['type'].partition('/') + rtmp_video_url = rendition.find('./src').text + formats.append({'ext': ext, + 'url': self._transform_rtmp_url(rtmp_video_url), + 'format_id': rendition.get('bitrate'), + 'width': int(rendition.get('width')), + 'height': int(rendition.get('height')), + }) + except (KeyError, TypeError): + raise ExtractorError('Invalid rendition field.') + return formats def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text @@ -81,19 +84,25 @@ class MTVIE(InfoExtractor): mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, u'Downloading video urls') - video_info = self._extract_video_url(mediagen_page) description_node = itemdoc.find('description') if description_node is not None: description = description_node.text else: description = None - video_info.update({'title': itemdoc.find('title').text, - 'id': video_id, - 'thumbnail': self._get_thumbnail_url(uri, itemdoc), - 'description': description, - }) - return video_info + + info = { + 'title': itemdoc.find('title').text, + 'formats': self._extract_video_formats(mediagen_page), + 'id': video_id, + 'thumbnail': self._get_thumbnail_url(uri, itemdoc), + 'description': description, + } + + # TODO: Remove when #980 has been merged + info.update(info['formats'][-1]) + + return info def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py new file mode 100644 index 000000000..9df236d69 --- /dev/null +++ b/youtube_dl/extractor/naver.py @@ -0,0 +1,73 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + ExtractorError, +) + + +class NaverIE(InfoExtractor): + _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P\d+)' + + _TEST = { + u'url': u'http://tvcast.naver.com/v/81652', + u'file': u'81652.mp4', + u'info_dict': { + u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + u'upload_date': u'20130903', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', + webpage) + if m_id is None: + raise ExtractorError(u'couldn\'t extract vid and key') + vid = m_id.group(1) + key = m_id.group(2) + query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,}) + query_urls = compat_urllib_parse.urlencode({ + 'masterVid': vid, + 'protocol': 'p2p', + 'inKey': key, + }) + info_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, + video_id, u'Downloading video info') + urls_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, + video_id, u'Downloading video formats info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) + + formats = [] + for format_el in urls.findall('EncodingOptions/EncodingOption'): + domain = format_el.find('Domain').text + if domain.startswith('rtmp'): + continue + formats.append({ + 'url': domain + format_el.find('uri').text, + 'ext': 'mp4', + 'width': int(format_el.find('width').text), + 'height': int(format_el.find('height').text), + }) + + info = { + 'id': video_id, + 'title': info.find('Subject').text, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': info.find('WriteDate').text.replace('.', ''), + 'view_count': int(info.find('PlayCount').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py new file mode 100644 index 000000000..2ef80bce0 --- /dev/null +++ b/youtube_dl/extractor/newgrounds.py @@ -0,0 +1,38 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class NewgroundsIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P\d+)' + _TEST = { + u'url': u'http://www.newgrounds.com/audio/listen/549479', + u'file': u'549479.mp3', + u'md5': u'fe6033d297591288fa1c1f780386f07a', + u'info_dict': { + u"title": u"B7 - BusMode", + u"uploader": u"Burn7", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + music_id = mobj.group('id') + webpage = self._download_webpage(url, music_id) + + title = self._html_search_regex(r',"name":"([^"]+)",', webpage, u'music title') + uploader = self._html_search_regex(r',"artist":"([^"]+)",', webpage, u'music uploader') + + music_url_json_string = self._html_search_regex(r'({"url":"[^"]+"),', webpage, u'music url') + '}' + music_url_json = json.loads(music_url_json_string) + music_url = music_url_json['url'] + + return { + 'id': music_id, + 'title': title, + 'url': music_url, + 'uploader': uploader, + 'ext': determine_ext(music_url), + } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index b734722d0..1f7b4d2e7 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -18,11 +18,15 @@ class OoyalaIE(InfoExtractor): }, } + @staticmethod + def _url_for_embed_code(embed_code): + return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + def _extract_result(self, info, more_info): return {'id': info['embedCode'], 'ext': 'mp4', 'title': unescapeHTML(info['title']), - 'url': info['url'], + 'url': info.get('ipad_url') or info['url'], 'description': unescapeHTML(more_info['description']), 'thumbnail': more_info['promo'], } @@ -35,7 +39,9 @@ class OoyalaIE(InfoExtractor): mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', player, u'mobile player url') mobile_player = self._download_webpage(mobile_url, embedCode) - videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') + videos_info = self._search_regex( + r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', + mobile_player, u'info').replace('\\"','"') videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') videos_info = json.loads(videos_info) videos_more_info =json.loads(videos_more_info) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 41ef8e992..cfca2a063 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -14,19 +14,6 @@ from ..utils import ( class ORFIE(InfoExtractor): _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P\d+)' - _TEST = { - u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter', - u'file': u'6566957.flv', - u'info_dict': { - u'title': u'Wetter', - u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at', - }, - u'params': { - # It uses rtmp - u'skip_download': True, - } - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 1d2cf1f56..bb19b898a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -14,24 +14,25 @@ class RedTubeIE(InfoExtractor): } } - def _real_extract(self,url): + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - video_extension = 'mp4' + video_extension = 'mp4' webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) - video_url = self._html_search_regex(r'', - webpage, u'video URL') + video_url = self._html_search_regex( + r'', webpage, u'video URL') - video_title = self._html_search_regex('

    (.+?)

    ', + video_title = self._html_search_regex( + r'

    (.+?)

    ', webpage, u'title') - return [{ + return { 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, - }] + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 7bb236c2b..d1b08c9bc 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -8,8 +8,8 @@ from ..utils import ( ) class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW""" - _VALID_URL = r'(?:http://)?(?P(?Prtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" + _VALID_URL = r'(?:http://)?(?P(?Prtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/|(?:www\.)?n-tvnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -61,8 +61,35 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', + u'file': u'127367.flv', + u'info_dict': { + u'upload_date': u'20130926', + u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', + u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', + u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', + }, + u'params': { + u'skip_download': True, + }, + }, + { + u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', + u'file': u'124903.flv', + u'info_dict': { + u'upload_date': u'20130101', + u'title': u'Top Gear vom 01.01.2013', + u'description': u'Episode 1', + }, + u'params': { + u'skip_download': True, + }, + u'skip': u'Only works from Germany', }] + def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) @@ -79,20 +106,23 @@ class RTLnowIE(InfoExtractor): msg = clean_html(note_m.group(1)) raise ExtractorError(msg) - video_title = self._html_search_regex(r'(?P<title>[^<]+)', + video_title = self._html_search_regex(r'(?P<title>[^<]+?)( \| [^<]*)?', webpage, u'title') playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', webpage, u'playerdata_url') playerdata = self._download_webpage(playerdata_url, video_id) - mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr\]\]>', playerdata) + mobj = re.search(r'<!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]>', playerdata) if mobj: video_description = mobj.group(u'description') if mobj.group('upload_date_Y'): video_upload_date = mobj.group('upload_date_Y') - else: + elif mobj.group('upload_date_y'): video_upload_date = u'20' + mobj.group('upload_date_y') - video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_upload_date = None + if video_upload_date: + video_upload_date += mobj.group('upload_date_m')+mobj.group('upload_date_d') else: video_description = None video_upload_date = None diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py new file mode 100644 index 000000000..afc3001b5 --- /dev/null +++ b/youtube_dl/extractor/slideshare.py @@ -0,0 +1,47 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + ExtractorError, +) + + +class SlideshareIE(InfoExtractor): + _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P.+?)($|\?)' + + _TEST = { + u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + u'file': u'25665706.mp4', + u'info_dict': { + u'title': u'Managing Scale and Complexity', + u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + slideshare_obj = self._search_regex( + r'var slideshare_object = ({.*?}); var user_info =', + webpage, u'slideshare object') + info = json.loads(slideshare_obj) + if info['slideshow']['type'] != u'video': + raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + + doc = info['doc'] + bucket = info['jsplayer']['video_bucket'] + ext = info['jsplayer']['video_extension'] + video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) + + return { + '_type': 'video', + 'id': info['slideshow']['id'], + 'title': info['slideshow']['title'], + 'ext': ext, + 'url': video_url, + 'thumbnail': info['slideshow']['pin_image_url'], + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 77bb0a8dc..2b9bf0cb7 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class SohuIE(InfoExtractor): - _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?' + _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' _TEST = { u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super', @@ -21,8 +21,11 @@ class SohuIE(InfoExtractor): def _real_extract(self, url): - def _fetch_data(vid_id): - base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' + def _fetch_data(vid_id, mytv=False): + if mytv: + base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid=' + else: + base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid=' data_url = base_data_url + str(vid_id) data_json = self._download_webpage( data_url, video_id, @@ -31,15 +34,16 @@ class SohuIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + mytv = mobj.group('mytv') is not None webpage = self._download_webpage(url, video_id) raw_title = self._html_search_regex(r'(?s)<title>(.+?)', webpage, u'video title') title = raw_title.partition('-')[0].strip() - vid = self._html_search_regex(r'var vid="(\d+)"', webpage, + vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage, u'video path') - data = _fetch_data(vid) + data = _fetch_data(vid, mytv) QUALITIES = ('ori', 'super', 'high', 'nor') vid_ids = [data['data'][q + 'Vid'] @@ -51,7 +55,7 @@ class SohuIE(InfoExtractor): # For now, we just pick the highest available quality vid_id = vid_ids[-1] - format_data = data if vid == vid_id else _fetch_data(vid_id) + format_data = data if vid == vid_id else _fetch_data(vid_id, mytv) part_count = format_data['data']['totalBlocks'] allot = format_data['allot'] prot = format_data['prot'] diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5f3a5540d..29cd5617c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,10 +1,12 @@ import json import re +import itertools from .common import InfoExtractor from ..utils import ( compat_str, compat_urlparse, + compat_urllib_parse, ExtractorError, unified_strdate, @@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None): + def _extract_info_dict(self, info, full_title=None, quiet=False): video_id = info['id'] name = full_title or video_id - self.report_extraction(name) + if quiet == False: + self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: @@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE): 'id': info['id'], 'title': info['title'], } + + +class SoundcloudUserIE(SoundcloudIE): + _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P[^/]+)(/?(tracks/)?)?(\?.*)?$' + IE_NAME = u'soundcloud:user' + + # it's in tests/test_playlists.py + _TEST = None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader = mobj.group('user') + + url = 'http://soundcloud.com/%s/' % uploader + resolv_url = self._resolv_url(url) + user_json = self._download_webpage(resolv_url, uploader, + u'Downloading user info') + user = json.loads(user_json) + + tracks = [] + for i in itertools.count(): + data = compat_urllib_parse.urlencode({'offset': i*50, + 'client_id': self._CLIENT_ID, + }) + tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data + response = self._download_webpage(tracks_url, uploader, + u'Downloading tracks page %s' % (i+1)) + new_tracks = json.loads(response) + tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks) + if len(new_tracks) < 50: + break + + return { + '_type': 'playlist', + 'id': compat_str(user['id']), + 'title': user['username'], + 'entries': tracks, + } diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py new file mode 100644 index 000000000..b1e96b679 --- /dev/null +++ b/youtube_dl/extractor/southparkstudios.py @@ -0,0 +1,38 @@ +import re + +from .mtv import MTVIE, _media_xml_tag + + +class SouthParkStudiosIE(MTVIE): + IE_NAME = u'southparkstudios.com' + _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$)' + + _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + + _TEST = { + u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', + u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', + u'info_dict': { + u'title': u'Bat Daded', + u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', + }, + } + + # Overwrite MTVIE properties we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + else: + return thumb_node.attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py new file mode 100644 index 000000000..90de7de3a --- /dev/null +++ b/youtube_dl/extractor/subtitles.py @@ -0,0 +1,91 @@ +from .common import InfoExtractor + +from ..utils import ( + compat_str, + ExtractorError, +) + + +class SubtitlesInfoExtractor(InfoExtractor): + @property + def _have_to_download_any_subtitles(self): + return any([self._downloader.params.get('writesubtitles', False), + self._downloader.params.get('writeautomaticsub')]) + + def _list_available_subtitles(self, video_id, webpage=None): + """ outputs the available subtitles for the video """ + sub_lang_list = self._get_available_subtitles(video_id) + auto_captions_list = self._get_available_automatic_caption(video_id, webpage) + sub_lang = ",".join(list(sub_lang_list.keys())) + self.to_screen(u'%s: Available subtitles for video: %s' % + (video_id, sub_lang)) + auto_lang = ",".join(auto_captions_list.keys()) + self.to_screen(u'%s: Available automatic captions for video: %s' % + (video_id, auto_lang)) + + def extract_subtitles(self, video_id, video_webpage=None): + """ + returns {sub_lang: sub} ,{} if subtitles not found or None if the + subtitles aren't requested. + """ + if not self._have_to_download_any_subtitles: + return None + available_subs_list = {} + if self._downloader.params.get('writeautomaticsub', False): + available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) + if self._downloader.params.get('writesubtitles', False): + available_subs_list.update(self._get_available_subtitles(video_id)) + + if not available_subs_list: # error, it didn't get the available subtitles + return {} + if self._downloader.params.get('allsubtitles', False): + sub_lang_list = available_subs_list + else: + if self._downloader.params.get('subtitleslangs', False): + requested_langs = self._downloader.params.get('subtitleslangs') + elif 'en' in available_subs_list: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs_list.keys())[0]] + + sub_lang_list = {} + for sub_lang in requested_langs: + if not sub_lang in available_subs_list: + self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) + continue + sub_lang_list[sub_lang] = available_subs_list[sub_lang] + + subtitles = {} + for sub_lang, url in sub_lang_list.items(): + subtitle = self._request_subtitle_url(sub_lang, url) + if subtitle: + subtitles[sub_lang] = subtitle + return subtitles + + def _request_subtitle_url(self, sub_lang, url): + """ makes the http request for the subtitle """ + try: + sub = self._download_webpage(url, None, note=False) + except ExtractorError as err: + self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) + return + if not sub: + self._downloader.report_warning(u'Did not fetch video subtitles') + return + return sub + + def _get_available_subtitles(self, video_id): + """ + returns {sub_lang: url} or {} if not available + Must be redefined by the subclasses + """ + pass + + def _get_available_automatic_caption(self, video_id, webpage): + """ + returns {sub_lang: url} or {} if not available + Must be redefined by the subclasses that support automatic captions, + otherwise it will return {} + """ + self._downloader.report_warning(u'Automatic Captions not supported by this server') + return {} diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 4c11f7a03..dfa1176a3 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -77,12 +77,20 @@ class TEDIE(InfoExtractor): thumbnail = self._search_regex(r'
    [\s.]*[\s.]*.+)' + IE_NAME = u'ustream:channel' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + slug = m.group('slug') + webpage = self._download_webpage(url, slug) + channel_id = get_meta_content('ustream:channel_id', webpage) + + BASE = 'http://www.ustream.tv' + next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id + video_ids = [] + while next_url: + reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id)) + video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data'])) + next_url = reply['nextUrl'] + + urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids] + url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls] + return self.playlist_result(url_entries, channel_id) diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py new file mode 100644 index 000000000..3a99a29c6 --- /dev/null +++ b/youtube_dl/extractor/veehd.py @@ -0,0 +1,56 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_id, + clean_html, +) + +class VeeHDIE(InfoExtractor): + _VALID_URL = r'https?://veehd.com/video/(?P\d+)' + + _TEST = { + u'url': u'http://veehd.com/video/4686958', + u'file': u'4686958.mp4', + u'info_dict': { + u'title': u'Time Lapse View from Space ( ISS)', + u'uploader_id': u'spotted', + u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', + webpage, u'player path') + player_url = compat_urlparse.urljoin(url, player_path) + player_page = self._download_webpage(player_url, video_id, + u'Downloading player page') + config_json = self._search_regex(r'value=\'config=({.+?})\'', + player_page, u'config json') + config = json.loads(config_json) + + video_url = compat_urlparse.unquote(config['clip']['url']) + title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) + uploader_id = self._html_search_regex(r'(.+?)', + webpage, u'uploader') + thumbnail = self._search_regex(r'(.*?).+)' + + _TEST = { + u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4', + u'info_dict': { + u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + u'params': { + # Requires ffmpeg (m3u8 manifest) + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + try: + ooyala_url = self._og_search_video_url(webpage) + except ExtractorError: + try: + embed_code = self._search_regex( + r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage, + u'ooyala embed code') + ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + except ExtractorError: + raise ExtractorError(u'The page doesn\'t contain a video', expected=True) + return self.url_result(ooyala_url, ie='Ooyala') + diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 512e06e2a..4a7d82b7a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -44,6 +44,16 @@ class VimeoIE(InfoExtractor): u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', }, }, + { + u'url': u'http://player.vimeo.com/video/54469442', + u'file': u'54469442.mp4', + u'md5': u'619b811a4417aa4abe78dc653becf511', + u'note': u'Videos that embed the url in the player page', + u'info_dict': { + u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', + u'uploader': u'The BLN & Business of Software', + }, + }, ] def _login(self): @@ -112,7 +122,8 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): @@ -132,12 +143,22 @@ class VimeoIE(InfoExtractor): video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] + video_thumbnail = config["video"].get("thumbnail") + if video_thumbnail is None: + _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' + video_description = None + try: + video_description = get_element_by_attribute("itemprop", "description", webpage) + if video_description: video_description = clean_html(video_description) + except AssertionError as err: + # On some pages like (http://player.vimeo.com/video/54469442) the + # html tags are not closed, python 2.6 cannot handle it + if err.args[0] == 'we should not get here!': + pass + else: + raise # Extract upload date video_upload_date = None @@ -154,14 +175,15 @@ class VimeoIE(InfoExtractor): # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] files = { 'hd': [], 'sd': [], 'other': []} + config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: - if codec_name in config["video"]["files"]: - if 'hd' in config["video"]["files"][codec_name]: + if codec_name in config_files: + if 'hd' in config_files[codec_name]: files['hd'].append((codec_name, codec_extension, 'hd')) - elif 'sd' in config["video"]["files"][codec_name]: + elif 'sd' in config_files[codec_name]: files['sd'].append((codec_name, codec_extension, 'sd')) else: - files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + files['other'].append((codec_name, codec_extension, config_files[codec_name][0])) for quality in ('hd', 'sd', 'other'): if len(files[quality]) > 0: @@ -173,8 +195,12 @@ class VimeoIE(InfoExtractor): else: raise ExtractorError(u'No known codec found') - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) + video_url = None + if isinstance(config_files[video_codec], dict): + video_url = config_files[video_codec][video_quality].get("url") + if video_url is None: + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 88b8b6be0..361619694 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,8 +11,8 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' + _TESTS = [{ u'url': u'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', u'file': u'1509445.flv', u'md5': u'9f48e0e8d58e3076bb236ff412ab62fa', @@ -21,13 +21,24 @@ class XHamsterIE(InfoExtractor): u"uploader_id": u"Ruseful2011", u"title": u"FemaleAgent Shy beauty takes the bait" } - } + }, + { + u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', + u'file': u'2221348.flv', + u'md5': u'e767b9475de189320f691f49c679c4c7', + u'info_dict': { + u"upload_date": u"20130914", + u"uploader_id": u"jojo747400", + u"title": u"Britney Spears Sexy Booty" + } + }] def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id + seo = mobj.group('seo') + mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 32d5b9477..5bdd5d591 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,4 +1,3 @@ -import datetime import itertools import json import re @@ -6,86 +5,89 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( compat_urllib_parse, - - ExtractorError, + compat_urlparse, + determine_ext, + clean_html, ) + class YahooIE(InfoExtractor): IE_DESC = u'Yahoo screen' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - _TEST = { - u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - u'file': u'214727115.flv', - u'md5': u'2e717f169c1be93d84d3794a00d4a325', - u'info_dict': { - u"title": u"Julian Smith & Travis Legg Watch Julian Smith" + _TESTS = [ + { + u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + u'file': u'214727115.flv', + u'info_dict': { + u'title': u'Julian Smith & Travis Legg Watch Julian Smith', + u'description': u'Julian and Travis watch Julian Smith', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, }, - u'skip': u'Requires rtmpdump' - } + { + u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', + u'file': u'103000935.flv', + u'info_dict': { + u'title': u'Codefellas - The Cougar Lies with Spanish Moss', + u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - youtube.com/xxxx is OK + )) + |youtu\.be/ # just youtu.be/xxxx + ) )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', # 3D '85', '84', '102', '83', '101', '82', '100', # Dash video @@ -163,8 +181,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Dash audio '141', '172', '140', '171', '139', ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', + # 3D '85', '102', '84', '101', '83', '100', '82', # Dash video '138', '248', '137', '247', '136', '246', '245', @@ -172,11 +192,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Dash audio '172', '141', '171', '140', '139', ] + _video_formats_map = { + 'flv': ['35', '34', '6', '5'], + '3gp': ['36', '17', '13'], + 'mp4': ['38', '37', '22', '18'], + 'webm': ['46', '45', '44', '43'], + } _video_extensions = { '13': '3gp', - '17': 'mp4', + '17': '3gp', '18': 'mp4', '22': 'mp4', + '36': '3gp', '37': 'mp4', '38': 'mp4', '43': 'webm', @@ -193,7 +220,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '101': 'webm', '102': 'webm', - # videos that use m3u8 + # Apple HTTP Live Streaming '92': 'mp4', '93': 'mp4', '94': 'mp4', @@ -234,6 +261,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '22': '720x1280', '34': '360x640', '35': '480x854', + '36': '240x320', '37': '1080x1920', '38': '3072x4096', '43': '360x640', @@ -335,7 +363,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", + u"description": u"md5:5b292926389560516e384ac437c0ec07", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -352,30 +380,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): u"uploader_id": u"justintimberlakeVEVO" } }, - { - u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE', - u'file': u'TGi3HqYrWHE.mp4', - u'note': u'm3u8 video', - u'info_dict': { - u'title': u'Triathlon - Men - London 2012 Olympic Games', - u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games', - u'uploader': u'olympic', - u'upload_date': u'20120807', - u'uploader_id': u'olympic', - }, - u'params': { - u'skip_download': True, - }, - }, ] @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False + if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._player_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -384,19 +401,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self.to_screen(u'%s: Extracting video information' % video_id) @@ -409,11 +413,664 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url, slen): + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', + player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + # Read from filesystem cache + func_id = '%s_%s_%d' % (player_type, player_id, slen) + assert os.path.basename(func_id) == func_id + cache_dir = get_cachedir(self._downloader.params) + + cache_enabled = cache_dir is not None + if cache_enabled: + cache_fn = os.path.join(os.path.expanduser(cache_dir), + u'youtube-sigfuncs', + func_id + '.json') + try: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + cache_spec = json.load(cachef) + return lambda s: u''.join(s[i] for i in cache_spec) + except IOError: + pass # No cache available + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, player_id), + errnote=u'Download of %s failed' % player_url) + res = self._parse_sig_js(code) + elif player_type == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, player_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + res = self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + if cache_enabled: + try: + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] + try: + os.makedirs(os.path.dirname(cache_fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(cache_spec, cache_fn) + except Exception: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Writing cache to %r failed: %s' % (cache_fn, tb)) + + return res + + def _print_sig_code(self, func, slen): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = u'' if start == 0 else str(start) + ends = (u':%d' % (end+step)) if end + step >= 0 else u':' + steps = u'' if step == 1 else (u':%d' % step) + return u's[%s%s%s]' % (starts, ends, steps) + + step = None + start = '(Never used)' # Quelch pyflakes warnings - start will be + # set as soon as step is set + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield u's[%d]' % prev + if step is None: + yield u's[%d]' % i + else: + yield _genslice(start, i, step) + + test_string = u''.join(map(compat_chr, range(slen))) + cache_res = func(test_string) + cache_spec = [ord(c) for c in cache_res] + expr_code = u' + '.join(gen_sig_code(cache_spec)) + code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) + self.to_screen(u'Extracted signature function:\n' + code) + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExtractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count = u30() + for class_id in range(class_count): + name_idx = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + if method_idx in method_idxs: + m = Method(code, local_count) + methods[method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == u'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == u'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 79: # callpropvoid + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + u'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 92: + if player_url is not None: + try: + player_id = (player_url, len(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, len(s) + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) + except Exception: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Automatic signature extraction failed: ' + tb) + + self._downloader.report_warning( + u'Warning: Falling back to static signature algorithm') + + return self._static_decrypt_signature( + s, video_id, player_url, age_gate) + + def _static_decrypt_signature(self, s, video_id, player_url, age_gate): + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] + elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] elif len(s) == 89: @@ -423,15 +1080,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[81:73:-1] + s[84] + s[72:58:-1] + s[0] + s[57:35:-1] + s[85] + s[34:0:-1] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] elif len(s) == 85: - return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] + return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: - return s[81:36:-1] + s[0] + s[35:2:-1] + return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] elif len(s) == 83: - return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] + return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] elif len(s) == 82: - return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82] + return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] elif len(s) == 80: @@ -442,65 +1099,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _decrypt_signature_age_gate(self, s): - # The videos with age protection use another player, so the algorithms - # can be different. - if len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] - else: - # Fallback to the other algortihms - return self._decrypt_signature(s) - - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + sub_list = self._download_webpage( + 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + video_id, note=False) + except ExtractorError as err: self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) return {} - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + + sub_lang_list = {} + for l in lang_list: + lang = l[1] + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': self._downloader.params.get('subtitlesformat'), + }) + url = u'http://www.youtube.com/api/timedtext?' + params + sub_lang_list[lang] = url if not sub_lang_list: self._downloader.report_warning(u'video doesn\'t have subtitles') return {} return sub_lang_list - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return the subtitle as a string or None if they are not found - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' + params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning(u'Did not fetch video subtitles') - return - return sub - - def _request_automatic_caption(self, video_id, webpage): + def _get_available_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0] sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + err_msg = u'Couldn\'t find automatic captions for %s' % video_id if mobj is None: self._downloader.report_warning(err_msg) return {} @@ -509,53 +1139,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args = player_config[u'args'] caption_url = args[u'ttsurl'] timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', + # We get the available subtitles + list_params = compat_urllib_parse.urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return {sub_lang: sub} + list_url = caption_url + '&' + list_params + list_page = self._download_webpage(list_url, video_id) + caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) + original_lang_node = caption_list.find('track') + if original_lang_node.attrib.get('kind') != 'asr' : + self._downloader.report_warning(u'Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + sub_lang_list[sub_lang] = caption_url + '&' + params + return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles except (KeyError, ExtractorError): self._downloader.report_warning(err_msg) return {} - - def _extract_subtitles(self, video_id): - """ - Return a dictionary: {language: subtitles} or {} if the subtitles - couldn't be found - """ - available_subs_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if not available_subs_list: #There was some error, it didn't get the available subtitles - return {} - if self._downloader.params.get('allsubtitles', False): - sub_lang_list = available_subs_list - else: - if self._downloader.params.get('subtitleslangs', False): - reqested_langs = self._downloader.params.get('subtitleslangs') - elif 'en' in available_subs_list: - reqested_langs = ['en'] - else: - reqested_langs = [list(available_subs_list.keys())[0]] - - sub_lang_list = {} - for sub_lang in reqested_langs: - if not sub_lang in available_subs_list: - self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) - continue - sub_lang_list[sub_lang] = available_subs_list[sub_lang] - subtitles = {} - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles def _print_formats(self, formats): print('Available formats:') @@ -597,13 +1212,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality + # available in the specified format. For example, + # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. + # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. req_formats = req_format.split('/') video_url_list = None for rf in req_formats: if rf in url_map: video_url_list = [(rf, url_map[rf])] break + if rf in self._video_formats_map: + for srf in self._video_formats_map[rf]: + if srf in url_map: + video_url_list = [(srf, url_map[srf])] + break + else: + continue + break if video_url_list is None: raise ExtractorError(u'requested format not available') return video_url_list @@ -644,7 +1271,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -708,9 +1335,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(u'unable to extract uploader nickname') # title - if 'title' not in video_info: - raise ExtractorError(u'Unable to extract video title') - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + if 'title' in video_info: + video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + else: + self._downloader.report_warning(u'Unable to extract video title') + video_title = u'_' # thumbnail image # We try first to get a high quality image: @@ -720,7 +1349,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: self._downloader.report_warning(u'unable to extract video thumbnail') - video_thumbnail = '' + video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -743,15 +1372,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_description = u'' # subtitles - video_subtitles = None - - if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): - video_subtitles = self._extract_subtitles(video_id) - elif self._downloader.params.get('writeautomaticsub', False): - video_subtitles = self._request_automatic_caption(video_id, video_webpage) + video_subtitles = self.extract_subtitles(video_id, video_webpage) if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id, video_webpage) return if 'length_seconds' not in video_info: @@ -770,6 +1394,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args = info['args'] # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted + if 'url_encoded_fmt_stream_map' not in args: + raise ValueError(u'No stream_map present') # caught below m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map']) if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) @@ -802,24 +1428,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: - if self._downloader.params.get('verbose'): - s = url_data['s'][0] - if age_gate: - player_version = self._search_regex(r'ad3-(.+?)\.swf', - video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND', - 'flash player', fatal=False) - player = 'flash player %s' % player_version - else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, - 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) - self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, url_data['itag'][0], player)) encrypted_sig = url_data['s'][0] - if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) - else: - signature = self._decrypt_signature(encrypted_sig) + if self._downloader.params.get('verbose'): + if age_gate: + if player_url is None: + player_version = 'unknown' + else: + player_version = self._search_regex( + r'-(.+)\.swf$', player_url, + u'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, + 'html5 player', fatal=False) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) + self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + + if not age_gate: + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + player_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -835,7 +1471,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return else: - raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') + raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') results = [] for format_param, video_real_url in video_url_list: @@ -893,9 +1529,19 @@ class YoutubePlaylistIE(InfoExtractor): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) + return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + else: + self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) # Download playlist videos from API - playlist_id = mobj.group(1) or mobj.group(2) videos = [] for page_num in itertools.count(1): @@ -920,8 +1566,11 @@ class YoutubePlaylistIE(InfoExtractor): for entry in response['feed']['entry']: index = entry['yt$position']['$t'] - if 'media$group' in entry and 'media$player' in entry['media$group']: - videos.append((index, entry['media$group']['media$player']['url'])) + if 'media$group' in entry and 'yt$videoid' in entry['media$group']: + videos.append(( + index, + 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] + )) videos = [v[1] for v in sorted(videos)] @@ -987,13 +1636,20 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' + @classmethod + def suitable(cls, url): + # Don't return True if the url can be extracted with other youtube + # extractor, the regex would is too permissive and it would match. + other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_ies): return False + else: return super(YoutubeUserIE, cls).suitable(url) + def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -1016,13 +1672,18 @@ class YoutubeUserIE(InfoExtractor): page = self._download_webpage(gdata_url, username, u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + try: + response = json.loads(page) + except ValueError as err: + raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS + break + # Extract video identifiers ids_in_page = [] - - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - + for entry in response['feed']['entry']: + ids_in_page.append(entry['id']['$t'].split('/')[-1]) video_ids.extend(ids_in_page) # A little optimization - if current page is not diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 418509cb9..faed7ff7f 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -2,16 +2,14 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, - unescapeHTML, ) + class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P[^/\?]+)(?:\?.*)?' - _TITLE = r'(?P.*)</h1>' + _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' - _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -19,6 +17,9 @@ class ZDFIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') + if mobj.group('hash'): + url = url.replace(u'#', u'', 1) + html = self._download_webpage(url, video_id) streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] if streams is None: @@ -27,39 +28,48 @@ class ZDFIE(InfoExtractor): # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url # choose first/default media type and highest quality for now - for s in streams: #find 300 - dsl1000mbit - if s['quality'] == '300' and s['media_type'] == 'wstreaming': - stream_=s - break - for s in streams: #find veryhigh - dsl2000mbit - if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working - stream_=s - break - if stream_ is None: + def stream_pref(s): + TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + try: + type_pref = TYPE_ORDER.index(s['media_type']) + except ValueError: + type_pref = 999 + + QUALITY_ORDER = ['veryhigh', '300'] + try: + quality_pref = QUALITY_ORDER.index(s['quality']) + except ValueError: + quality_pref = 999 + + return (type_pref, quality_pref) + + sorted_streams = sorted(streams, key=stream_pref) + if not sorted_streams: raise ExtractorError(u'No stream found.') + stream = sorted_streams[0] - media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + media_link = self._download_webpage( + stream['video_url'], + video_id, + u'Get stream URL') - self.report_extraction(video_id) - mobj = re.search(self._TITLE, html) + MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' + + mobj = re.search(self._MEDIA_STREAM, media_link) if mobj is None: - raise ExtractorError(u'Cannot extract title') - title = unescapeHTML(mobj.group('title')) - - mobj = re.search(self._MMS_STREAM, media_link) - if mobj is None: - mobj = re.search(self._RTSP_STREAM, media_link) + mobj = re.search(RTSP_STREAM, media_link) if mobj is None: raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - mms_url = mobj.group('video_url') + video_url = mobj.group('video_url') - mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) - if mobj is None: - raise ExtractorError(u'Cannot extract extention') - ext = mobj.group('ext') + title = self._html_search_regex( + r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', + html, u'title') - return [{'id': video_id, - 'url': mms_url, - 'title': title, - 'ext': ext - }] + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url) + } diff --git a/youtube_dl/update.py b/youtube_dl/update.py index ccab6f27f..0689a4891 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -1,6 +1,9 @@ +import io import json import traceback import hashlib +import subprocess +import sys from zipimport import zipimporter from .utils import * @@ -34,7 +37,7 @@ def rsa_verify(message, signature, key): if signature != sha256(message).digest(): return False return True -def update_self(to_screen, verbose, filename): +def update_self(to_screen, verbose): """Update the program file with the latest version from the repository""" UPDATE_URL = "http://rg3.github.io/youtube-dl/update/" @@ -42,7 +45,6 @@ def update_self(to_screen, verbose, filename): JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) - if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, "frozen"): to_screen(u'It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.') return @@ -75,11 +77,18 @@ def update_self(to_screen, verbose, filename): to_screen(u'ERROR: the versions file signature is invalid. Aborting.') return - to_screen(u'Updating to version ' + versions_info['latest'] + '...') - version = versions_info['versions'][versions_info['latest']] + version_id = versions_info['latest'] + to_screen(u'Updating to version ' + version_id + '...') + version = versions_info['versions'][version_id] print_notes(to_screen, versions_info['versions']) + filename = sys.argv[0] + # Py2EXE: Filename could be different + if hasattr(sys, "frozen") and not os.path.isfile(filename): + if os.path.isfile(filename + u'.exe'): + filename += u'.exe' + if not os.access(filename, os.W_OK): to_screen(u'ERROR: no write permissions on %s' % filename) return @@ -116,16 +125,18 @@ def update_self(to_screen, verbose, filename): try: bat = os.path.join(directory, 'youtube-dl-updater.bat') - b = open(bat, 'w') - b.write(""" -echo Updating youtube-dl... + with io.open(bat, 'w') as batfile: + batfile.write(u""" +@echo off +echo Waiting for file handle to be closed ... ping 127.0.0.1 -n 5 -w 1000 > NUL -move /Y "%s.new" "%s" -del "%s" - \n""" %(exe, exe, bat)) - b.close() +move /Y "%s.new" "%s" > NUL +echo Updated youtube-dl to version %s. +start /b "" cmd /c del "%%~f0"&exit /b" + \n""" % (exe, exe, version_id)) - os.startfile(bat) + subprocess.Popen([bat]) # Continues to run in the background + return # Do not show premature success messages except (IOError, OSError) as err: if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to overwrite current version') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 201802cee..f5f9cde99 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -66,6 +66,12 @@ try: except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -249,7 +255,17 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): + def __init(self): + compat_html_parser.HTMLParser.__init__(self) + self.html = None + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + +class AttrParser(BaseHTMLParser): """Modified HTMLParser that isolates a tag with the specified attribute""" def __init__(self, attribute, value): self.attribute = attribute @@ -257,10 +273,9 @@ class AttrParser(compat_html_parser.HTMLParser): self.result = None self.started = False self.depth = {} - self.html = None self.watch_startpos = False self.error_count = 0 - compat_html_parser.HTMLParser.__init__(self) + BaseHTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: @@ -269,11 +284,6 @@ class AttrParser(compat_html_parser.HTMLParser): self.error_count += 1 self.goahead(1) - def loads(self, html): - self.html = html - self.feed(html) - self.close() - def handle_starttag(self, tag, attrs): attrs = dict(attrs) if self.started: @@ -334,6 +344,38 @@ def get_element_by_attribute(attribute, value, html): pass return parser.get_result() +class MetaParser(BaseHTMLParser): + """ + Modified HTMLParser that isolates a meta tag with the specified name + attribute. + """ + def __init__(self, name): + BaseHTMLParser.__init__(self) + self.name = name + self.content = None + self.result = None + + def handle_starttag(self, tag, attrs): + if tag != 'meta': + return + attrs = dict(attrs) + if attrs.get('name') == self.name: + self.result = attrs.get('content') + + def get_result(self): + return self.result + +def get_meta_content(name, html): + """ + Return the content attribute from the meta tag with the given name attribute. + """ + parser = MetaParser(name) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -664,7 +706,16 @@ def unified_strdate(date_str): date_str = date_str.replace(',',' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] + format_expressions = [ + '%d %B %Y', + '%B %d %Y', + '%b %d %Y', + '%Y-%m-%d', + '%d/%m/%Y', + '%Y/%m/%d %H:%M:%S', + '%d.%m.%Y %H:%M', + '%Y-%m-%dT%H:%M:%SZ', + ] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -745,6 +796,18 @@ def platform_name(): return res +def write_string(s, out=None): + if out is None: + out = sys.stderr + assert type(s) == type(u'') + + if ('b' in getattr(out, 'mode', '') or + sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr + s = s.encode(preferredencoding(), 'ignore') + out.write(s) + out.flush() + + def bytes_to_intlist(bs): if not bs: return [] @@ -761,3 +824,9 @@ def intlist_to_bytes(xs): return ''.join([chr(x) for x in xs]) else: return bytes(xs) + + +def get_cachedir(params={}): + cache_root = os.environ.get('XDG_CACHE_HOME', + os.path.expanduser('~/.cache')) + return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b6284c6d6..e773e82da 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.30' +__version__ = '2013.10.04'