From 068629da73622873a5869c63335209ce056a49ac Mon Sep 17 00:00:00 2001 From: brotherBox Date: Thu, 27 Nov 2014 09:14:21 +0100 Subject: [PATCH 01/11] Added --reverse-playlist feature --- youtube_dl/YoutubeDL.py | 30 ++++++++++++++++++++---------- youtube_dl/__init__.py | 2 +- youtube_dl/options.py | 3 +++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 21c7c298a..acec87d5e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -529,7 +529,7 @@ class YoutubeDL(object): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True): + process=True, reverse=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. @@ -561,7 +561,9 @@ class YoutubeDL(object): } self.add_default_extra_info(ie_result, ie, url) if process: - return self.process_ie_result(ie_result, download, extra_info) + return self.process_ie_result( + ie_result, download=download, extra_info=extra_info, + reverse=reverse) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -586,7 +588,9 @@ class YoutubeDL(object): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, extra_info={}): + def process_ie_result(self, ie_result, download=True, reverse=False, + extra_info={}): + """ Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). 
@@ -640,7 +644,8 @@ class YoutubeDL(object): make_result(e) for e in new_result['entries']] return self.process_ie_result( - new_result, download=download, extra_info=extra_info) + new_result, download=download, extra_info=extra_info, + reverse=False) elif result_type == 'playlist' or result_type == 'multi_video': # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) @@ -670,6 +675,9 @@ class YoutubeDL(object): "[%s] playlist %s: Downloading %d videos" % (ie_result['extractor'], playlist, n_entries)) + if reverse: + entries = reversed(entries) + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) extra = { @@ -691,7 +699,8 @@ class YoutubeDL(object): entry_result = self.process_ie_result(entry, download=download, - extra_info=extra) + extra_info=extra, + reverse=reverse) playlist_results.append(entry_result) ie_result['entries'] = playlist_results return ie_result @@ -712,7 +721,8 @@ class YoutubeDL(object): ) return r ie_result['entries'] = [ - self.process_ie_result(_fixup(r), download, extra_info) + self.process_ie_result(_fixup(r), download=download, + extra_info=extra_info, reverse=reverse) for r in ie_result['entries'] ] return ie_result @@ -1103,7 +1113,7 @@ class YoutubeDL(object): self.record_download_archive(info_dict) - def download(self, url_list): + def download(self, url_list, reverse=True): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and @@ -1114,7 +1124,7 @@ class YoutubeDL(object): for url in url_list: try: # It also downloads the videos - res = self.extract_info(url) + res = self.extract_info(url, reverse=reverse) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: @@ -1126,11 +1136,11 @@ class YoutubeDL(object): return self._download_retcode - def download_with_info_file(self, info_filename): + def 
download_with_info_file(self, info_filename, reverse=False): with io.open(info_filename, 'r', encoding='utf-8') as f: info = json.load(f) try: - self.process_ie_result(info, download=True) + self.process_ie_result(info, download=True, reverse=reverse) except DownloadError: webpage_url = info.get('webpage_url') if webpage_url is not None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 77b3384a0..0139f49d0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -342,7 +342,7 @@ def _real_main(argv=None): if opts.load_info_filename is not None: retcode = ydl.download_with_info_file(opts.load_info_filename) else: - retcode = ydl.download(all_urls) + retcode = ydl.download(all_urls, reverse=opts.reverse_playlist) except MaxDownloadsReached: ydl.to_screen('--max-download limit reached, aborting.') retcode = 101 diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 2e8c71508..27849dca3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -235,6 +235,9 @@ def parseOpts(overrideArguments=None): '--include-ads', dest='include_ads', action='store_true', help='Download advertisements as well (experimental)') + selection.add_option( + '--reverse-playlist', + dest='reverse_playlist', action='store_true', help='Reverse playlists') authentication = optparse.OptionGroup(parser, 'Authentication Options') authentication.add_option( From f2c22d9f3ff2fde6179f07343b2708b753eada6d Mon Sep 17 00:00:00 2001 From: brotherBox Date: Fri, 28 Nov 2014 11:25:48 +0100 Subject: [PATCH 02/11] Fixed --playlist-start feature --- youtube_dl/YoutubeDL.py | 15 +++++++++------ youtube_dl/utils.py | 7 +++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index acec87d5e..13488d55e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -548,7 +548,6 @@ class YoutubeDL(object): if not ie.working(): self.report_warning('The program functionality for this site has been 
marked as broken, ' 'and will probably not work.') - try: ie_result = ie.extract(url) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) @@ -668,16 +667,20 @@ class YoutubeDL(object): (ie_result['extractor'], playlist, n_all_entries, n_entries)) else: assert isinstance(ie_result['entries'], PagedList) - entries = ie_result['entries'].getslice( - playliststart, playlistend) + entries = ie_result['entries'] + if reverse: + entries = entries.getslice(0, None) + entries = entries[::-1] + entries = entries[playliststart:playlistend] + else: + entries = entries.getslice( + playliststart, playlistend) + n_entries = len(entries) self.to_screen( "[%s] playlist %s: Downloading %d videos" % (ie_result['extractor'], playlist, n_entries)) - if reverse: - entries = reversed(entries) - for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries)) extra = { diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4d3cbac74..b685e9a93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1265,16 +1265,19 @@ class OnDemandPagedList(PagedList): self._pagefunc = pagefunc self._pagesize = pagesize - def getslice(self, start=0, end=None): + def getslice(self, start=0, end=None, reverse=False): res = [] + + # r = list(itertools.count(start // self._pagesize)) + # for pagenum in r: for pagenum in itertools.count(start // self._pagesize): + # absolute videos before current page; page 2 * 50 v/p = 100 firstid = pagenum * self._pagesize nextfirstid = pagenum * self._pagesize + self._pagesize if start >= nextfirstid: continue page_results = list(self._pagefunc(pagenum)) - startv = ( start % self._pagesize if firstid <= start < nextfirstid From 58a952ae201ac644203b6434cd099f52ff6a8253 Mon Sep 17 00:00:00 2001 From: brotherBox Date: Fri, 5 Dec 2014 19:58:36 +0100 Subject: [PATCH 03/11] Updated readme --- README.md | 1 + youtube_dl/extractor/youtube.py | 2 +- 
youtube_dl/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d6e7ff902..0550d8970 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ which means you can modify it, redistribute it or use it however you like. downloaded videos in it. --include-ads Download advertisements as well (experimental) + --reverse-playlist Reverses video order in playlist ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 115fc6840..baf33c655 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1357,7 +1357,7 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. - def download_page(pagenum): + def download_page(pagenum, reverse=False): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 27849dca3..ca25aac98 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -237,7 +237,7 @@ def parseOpts(overrideArguments=None): help='Download advertisements as well (experimental)') selection.add_option( '--reverse-playlist', - dest='reverse_playlist', action='store_true', help='Reverse playlists') + dest='reverse_playlist', action='store_true', help='Reverses video order in playlist') authentication = optparse.OptionGroup(parser, 'Authentication Options') authentication.add_option( From d3a951c09b9c721e372294961b1ead497d428320 Mon Sep 17 00:00:00 2001 From: brotherBox Date: Fri, 5 Dec 2014 19:59:47 +0100 Subject: [PATCH 04/11] Updated readme again --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0550d8970..68b97dc2d 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ YouTube.com and a few more sites. 
It requires the Python interpreter, version your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. +This is a modified version to enable the reversal of a playlist. It works for me, and that is all I can promise. + # OPTIONS -h, --help print this help text and exit --version print program version and exit From 97b2409fde73d0b0a41bbda0c08f7ebc3960de6e Mon Sep 17 00:00:00 2001 From: brotherBox Date: Tue, 23 Dec 2014 13:38:35 +0100 Subject: [PATCH 05/11] Beginning playlist reversal --- README.md | 3 --- setup.py | 0 youtube_dl/extractor/youtube.py | 2 +- youtube_dl/options.py | 2 +- 4 files changed, 2 insertions(+), 5 deletions(-) mode change 100644 => 100755 setup.py diff --git a/README.md b/README.md index 68b97dc2d..d6e7ff902 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,6 @@ YouTube.com and a few more sites. It requires the Python interpreter, version your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. -This is a modified version to enable the reversal of a playlist. It works for me, and that is all I can promise. - # OPTIONS -h, --help print this help text and exit --version print program version and exit @@ -104,7 +102,6 @@ This is a modified version to enable the reversal of a playlist. It works for me downloaded videos in it. 
--include-ads Download advertisements as well (experimental) - --reverse-playlist Reverses video order in playlist ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index baf33c655..115fc6840 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1357,7 +1357,7 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. - def download_page(pagenum, reverse=False): + def download_page(pagenum): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index ca25aac98..27849dca3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -237,7 +237,7 @@ def parseOpts(overrideArguments=None): help='Download advertisements as well (experimental)') selection.add_option( '--reverse-playlist', - dest='reverse_playlist', action='store_true', help='Reverses video order in playlist') + dest='reverse_playlist', action='store_true', help='Reverse playlists') authentication = optparse.OptionGroup(parser, 'Authentication Options') authentication.add_option( From 82f521272c9c6bde5699639099b0f23bc460130e Mon Sep 17 00:00:00 2001 From: brotherBox Date: Tue, 23 Dec 2014 13:58:09 +0100 Subject: [PATCH 06/11] Integrated reverse playlist feature with internals --- youtube_dl/YoutubeDL.py | 26 +++++++++++--------------- youtube_dl/__init__.py | 3 ++- youtube_dl/options.py | 3 ++- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13488d55e..e3fb43ef2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -529,7 +529,7 @@ class YoutubeDL(object): info_dict.setdefault(key, value) def extract_info(self, url, 
download=True, ie_key=None, extra_info={}, - process=True, reverse=False): + process=True): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. @@ -561,8 +561,7 @@ class YoutubeDL(object): self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result( - ie_result, download=download, extra_info=extra_info, - reverse=reverse) + ie_result, download=download, extra_info=extra_info) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -587,8 +586,7 @@ class YoutubeDL(object): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, reverse=False, - extra_info={}): + def process_ie_result(self, ie_result, download=True, extra_info={}): """ Take the result of the ie(may be modified) and resolve all unresolved @@ -643,8 +641,7 @@ class YoutubeDL(object): make_result(e) for e in new_result['entries']] return self.process_ie_result( - new_result, download=download, extra_info=extra_info, - reverse=False) + new_result, download=download, extra_info=extra_info) elif result_type == 'playlist' or result_type == 'multi_video': # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) @@ -668,7 +665,7 @@ class YoutubeDL(object): else: assert isinstance(ie_result['entries'], PagedList) entries = ie_result['entries'] - if reverse: + if self.params.get('reverse_playlist'): entries = entries.getslice(0, None) entries = entries[::-1] entries = entries[playliststart:playlistend] @@ -702,8 +699,7 @@ class YoutubeDL(object): entry_result = self.process_ie_result(entry, download=download, - extra_info=extra, - reverse=reverse) + extra_info=extra) playlist_results.append(entry_result) ie_result['entries'] = playlist_results return ie_result @@ -725,7 +721,7 @@ class YoutubeDL(object): return r ie_result['entries'] = [ self.process_ie_result(_fixup(r), download=download, - extra_info=extra_info, 
reverse=reverse) + extra_info=extra_info) for r in ie_result['entries'] ] return ie_result @@ -1116,7 +1112,7 @@ class YoutubeDL(object): self.record_download_archive(info_dict) - def download(self, url_list, reverse=True): + def download(self, url_list): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and @@ -1127,7 +1123,7 @@ class YoutubeDL(object): for url in url_list: try: # It also downloads the videos - res = self.extract_info(url, reverse=reverse) + res = self.extract_info(url) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: @@ -1139,11 +1135,11 @@ class YoutubeDL(object): return self._download_retcode - def download_with_info_file(self, info_filename, reverse=False): + def download_with_info_file(self, info_filename): with io.open(info_filename, 'r', encoding='utf-8') as f: info = json.load(f) try: - self.process_ie_result(info, download=True, reverse=reverse) + self.process_ie_result(info, download=True) except DownloadError: webpage_url = info.get('webpage_url') if webpage_url is not None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0139f49d0..2c28b7bbf 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -247,6 +247,7 @@ def _real_main(argv=None): 'continuedl': opts.continue_dl, 'noprogress': opts.noprogress, 'progress_with_newline': opts.progress_with_newline, + 'reverse_playlist': opts.reverse_playlist, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'noplaylist': opts.noplaylist, @@ -342,7 +343,7 @@ def _real_main(argv=None): if opts.load_info_filename is not None: retcode = ydl.download_with_info_file(opts.load_info_filename) else: - retcode = ydl.download(all_urls, reverse=opts.reverse_playlist) + retcode = ydl.download(all_urls) except MaxDownloadsReached: ydl.to_screen('--max-download limit reached, aborting.') retcode = 101 diff --git a/youtube_dl/options.py 
b/youtube_dl/options.py index 27849dca3..93d966675 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -237,7 +237,8 @@ def parseOpts(overrideArguments=None): help='Download advertisements as well (experimental)') selection.add_option( '--reverse-playlist', - dest='reverse_playlist', action='store_true', help='Reverse playlists') + help='Reverse playlists', + dest='reverse_playlist', action='store_true', default=False) authentication = optparse.OptionGroup(parser, 'Authentication Options') authentication.add_option( From 730921b963940eedcb0c561282096c4e9368652c Mon Sep 17 00:00:00 2001 From: luceatnobis Date: Sat, 29 Aug 2015 11:14:49 +0200 Subject: [PATCH 07/11] Made it work again for very limited purposes. --- youtube_dl/YoutubeDL.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cad6b026e..003465567 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -793,15 +793,21 @@ class YoutubeDL(object): entry_list = list(ie_entries) entries = [entry_list[i - 1] for i in playlistitems] else: + # ADDED + entries = list(ie_entries) + """ entries = list(itertools.islice( ie_entries, playliststart, playlistend)) + """ n_entries = len(entries) self.to_screen( "[%s] playlist %s: Downloading %d videos" % (ie_result['extractor'], playlist, n_entries)) + # ADDED if self.params.get('playlistreverse', False): entries = entries[::-1] + entries = entries[playliststart:playlistend] for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) From 8d889462b82cc0f750dacb553a32578d947cb271 Mon Sep 17 00:00:00 2001 From: luceatnobis Date: Sat, 29 Aug 2015 11:28:34 +0200 Subject: [PATCH 08/11] Made it somehow work i suppose --- setup.py | 111 ++ youtube_dl/YoutubeDL.py | 2024 +++++++++++++++++++++++++++++++++ youtube_dl/__init__.py | 418 +++++++ youtube_dl/options.py | 801 +++++++++++++ youtube_dl/utils.py | 2401 
+++++++++++++++++++++++++++++++++++++++ 5 files changed, 5755 insertions(+) create mode 100644 setup.py create mode 100755 youtube_dl/YoutubeDL.py create mode 100644 youtube_dl/__init__.py create mode 100644 youtube_dl/options.py create mode 100644 youtube_dl/utils.py diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..4686260e0 --- /dev/null +++ b/setup.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import os.path +import warnings +import sys + +try: + from setuptools import setup + setuptools_available = True +except ImportError: + from distutils.core import setup + setuptools_available = False + +try: + # This will create an exe that needs Microsoft Visual C++ 2008 + # Redistributable Package + import py2exe +except ImportError: + if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': + print("Cannot import py2exe", file=sys.stderr) + exit(1) + +py2exe_options = { + "bundle_files": 1, + "compressed": 1, + "optimize": 2, + "dist_dir": '.', + "dll_excludes": ['w9xpopen.exe'], +} + +py2exe_console = [{ + "script": "./youtube_dl/__main__.py", + "dest_base": "youtube-dl", +}] + +py2exe_params = { + 'console': py2exe_console, + 'options': {"py2exe": py2exe_options}, + 'zipfile': None +} + +if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': + params = py2exe_params +else: + files_spec = [ + ('etc/bash_completion.d', ['youtube-dl.bash-completion']), + ('etc/fish/completions', ['youtube-dl.fish']), + ('share/doc/youtube_dl', ['README.txt']), + ('share/man/man1', ['youtube-dl.1']) + ] + root = os.path.dirname(os.path.abspath(__file__)) + data_files = [] + for dirname, files in files_spec: + resfiles = [] + for fn in files: + if not os.path.exists(fn): + warnings.warn('Skipping file %s since it is not present. Type make to build all automatically generated files.' 
% fn) + else: + resfiles.append(fn) + data_files.append((dirname, resfiles)) + + params = { + 'data_files': data_files, + } + if setuptools_available: + params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']} + else: + params['scripts'] = ['bin/youtube-dl'] + +# Get the version from youtube_dl/version.py without importing the package +exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + +setup( + name='youtube_dl', + version=__version__, + description='YouTube video downloader', + long_description='Small command-line program to download videos from' + ' YouTube.com and other video sites.', + url='https://github.com/rg3/youtube-dl', + author='Ricardo Garcia', + author_email='ytdl@yt-dl.org', + maintainer='Philipp Hagemeister', + maintainer_email='phihag@phihag.de', + packages=[ + 'youtube_dl', + 'youtube_dl.extractor', 'youtube_dl.downloader', + 'youtube_dl.postprocessor'], + + # Provokes warning on most systems (why?!) + # test_suite = 'nose.collector', + # test_requires = ['nosetest'], + + classifiers=[ + "Topic :: Multimedia :: Video", + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "License :: Public Domain", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.2", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + ], + + **params +) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py new file mode 100755 index 000000000..003465567 --- /dev/null +++ b/youtube_dl/YoutubeDL.py @@ -0,0 +1,2024 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import, unicode_literals + +import collections +import contextlib +import datetime +import errno +import fileinput +import io +import itertools +import json +import locale +import operator +import os +import platform +import re +import shutil 
+import subprocess +import socket +import sys +import time +import tokenize +import traceback + +if os.name == 'nt': + import ctypes + +from .compat import ( + compat_cookiejar, + compat_expanduser, + compat_get_terminal_size, + compat_http_client, + compat_kwargs, + compat_str, + compat_tokenize_tokenize, + compat_urllib_error, + compat_urllib_request, +) +from .utils import ( + ContentTooShortError, + date_from_str, + DateRange, + DEFAULT_OUTTMPL, + determine_ext, + DownloadError, + encodeFilename, + ExtractorError, + format_bytes, + formatSeconds, + locked_file, + make_HTTPS_handler, + MaxDownloadsReached, + PagedList, + parse_filesize, + PerRequestProxyHandler, + PostProcessingError, + platform_name, + preferredencoding, + render_table, + SameFileError, + sanitize_filename, + sanitize_path, + std_headers, + subtitles_filename, + UnavailableVideoError, + url_basename, + version_tuple, + write_json_file, + write_string, + YoutubeDLHandler, + prepend_extension, + replace_extension, + args_to_str, + age_restricted, +) +from .cache import Cache +from .extractor import get_info_extractor, gen_extractors +from .downloader import get_suitable_downloader +from .downloader.rtmp import rtmpdump_version +from .postprocessor import ( + FFmpegFixupM4aPP, + FFmpegFixupStretchedPP, + FFmpegMergerPP, + FFmpegPostProcessor, + get_postprocessor, +) +from .version import __version__ + + +class YoutubeDL(object): + """YoutubeDL class. + + YoutubeDL objects are the ones responsible of downloading the + actual video file and writing it to disk if the user has requested + it, among some other tasks. In most cases there should be one per + program. As, given a video URL, the downloader doesn't know how to + extract all the needed information, task that InfoExtractors do, it + has to pass the URL to one of them. + + For this, YoutubeDL objects have a method that allows + InfoExtractors to be registered in a given order. 
When it is passed + a URL, the YoutubeDL object handles it to the first InfoExtractor it + finds that reports being able to handle it. The InfoExtractor extracts + all the information about the video or videos the URL refers to, and + YoutubeDL process the extracted information, possibly using a File + Downloader to download the video. + + YoutubeDL objects accept a lot of parameters. In order not to saturate + the object constructor with arguments, it receives a dictionary of + options instead. These options are available through the params + attribute for the InfoExtractors to use. The YoutubeDL also + registers itself as the downloader in charge for the InfoExtractors + that are added to it, so this is a "mutual registration". + + Available options: + + username: Username for authentication purposes. + password: Password for authentication purposes. + videopassword: Password for accessing a video. + usenetrc: Use netrc for authentication instead. + verbose: Print additional info to stdout. + quiet: Do not print messages to stdout. + no_warnings: Do not print out anything for warnings. + forceurl: Force printing final URL. + forcetitle: Force printing title. + forceid: Force printing ID. + forcethumbnail: Force printing thumbnail URL. + forcedescription: Force printing description. + forcefilename: Force printing final filename. + forceduration: Force printing duration. + forcejson: Force printing info_dict as JSON. + dump_single_json: Force printing the info_dict of the whole playlist + (or video) as a single JSON line. + simulate: Do not download the video files. + format: Video format code. See options.py for more information. + outtmpl: Template for output names. + restrictfilenames: Do not allow "&" and spaces in file names + ignoreerrors: Do not stop on download errors. + force_generic_extractor: Force downloader to use the generic extractor + nooverwrites: Prevent overwriting files. + playliststart: Playlist item to start at. 
+ playlistend: Playlist item to end at. + playlist_items: Specific indices of playlist to download. + playlistreverse: Download playlist items in reverse order. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. + logger: Log messages to a logging.Logger instance. + logtostderr: Log messages to stderr instead of stdout. + writedescription: Write the video description to a .description file + writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file + writethumbnail: Write the thumbnail image to a file + write_all_thumbnails: Write all thumbnail formats to files + writesubtitles: Write the video subtitles to a file + writeautomaticsub: Write the automatic subtitles to a file + allsubtitles: Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + listsubtitles: Lists all available subtitles for the video + subtitlesformat: The format code for subtitles + subtitleslangs: List of languages of the subtitles to download + keepvideo: Keep the video file after post-processing + daterange: A DateRange object, download only if the upload_date is in the range. + skip_download: Skip the actual download of the video file + cachedir: Location of the cache files in the filesystem. + False to disable filesystem cache. + noplaylist: Download single video instead of a playlist if in doubt. + age_limit: An integer representing the user's age in years. + Unsuitable videos for the given age are skipped. + min_views: An integer representing the minimum view count the video + must have in order to not be skipped. + Videos without view count information are always + downloaded. None for no limit. + max_views: An integer representing the maximum view count. + Videos that are more popular than that are not + downloaded. + Videos without view count information are always + downloaded. None for no limit. 
+ download_archive: File name of a file where all downloads are recorded. + Videos already present in the file are not downloaded + again. + cookiefile: File name where cookies should be read from and dumped to. + nocheckcertificate:Do not verify SSL certificates + prefer_insecure: Use HTTP instead of HTTPS to retrieve information. + At the moment, this is only supported by YouTube. + proxy: URL of the proxy server to use + cn_verification_proxy: URL of the proxy to use for IP address verification + on Chinese sites. (Experimental) + socket_timeout: Time to wait for unresponsive hosts, in seconds + bidi_workaround: Work around buggy terminals without bidirectional text + support, using fridibi + debug_printtraffic:Print out sent and received HTTP traffic + include_ads: Download ads as well + default_search: Prepend this string if an input url is not valid. + 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. + extract_flat: Do not resolve URLs, return the immediate result. + Pass in 'in_playlist' to only show this behavior for + playlist items. + postprocessors: A list of dictionaries, each with an entry + * key: The name of the postprocessor. See + youtube_dl/postprocessor/__init__.py for a list. + as well as any further keyword arguments for the + postprocessor. + progress_hooks: A list of functions that get called on download + progress, with a dictionary with the entries + * status: One of "downloading", "error", or "finished". + Check this first and ignore unknown values. + + If status is one of "downloading", or "finished", the + following properties may also be present: + * filename: The final filename (always present) + * tmpfilename: The filename we're currently writing to + * downloaded_bytes: Bytes on disk + * total_bytes: Size of the whole file, None if unknown + * total_bytes_estimate: Guess of the eventual file size, + None if unavailable. + * elapsed: The number of seconds since download started. 
+ * eta: The estimated time in seconds, None if unknown + * speed: The download speed in bytes/second, None if + unknown + * fragment_index: The counter of the currently + downloaded video fragment. + * fragment_count: The number of fragments (= individual + files that will be merged) + + Progress hooks are guaranteed to be called at least once + (with status "finished") if the download is successful. + merge_output_format: Extension to use when merging formats. + fixup: Automatically correct known faults of the file. + One of: + - "never": do nothing + - "warn": only emit a warning + - "detect_or_warn": check whether we can do anything + about it, warn otherwise (default) + source_address: (Experimental) Client-side IP address to bind to. + call_home: Boolean, true iff we are allowed to contact the + youtube-dl servers for debugging. + sleep_interval: Number of seconds to sleep before each download. + listformats: Print an overview of available video formats and exit. + list_thumbnails: Print a table of all thumbnails and exit. + match_filter: A function that gets called with the info_dict of + every video. + If it returns a message, the video is ignored. + If it returns None, the video is downloaded. + match_filter_func in utils.py is one example for this. + no_color: Do not emit color codes in output. + + The following options determine which downloader is picked: + external_downloader: Executable of the external downloader to call. + None or unset for standard (built-in) downloader. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. + + The following parameters are not used by YoutubeDL itself, they are used by + the downloader (see youtube_dl/downloader/common.py): + nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, + noresizebuffer, retries, continuedl, noprogress, consoletitle, + xattr_set_filesize, external_downloader_args. 
+ + The following options are used by the post processors: + prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, + otherwise prefer avconv. + postprocessor_args: A list of additional command-line arguments for the + postprocessor. + """ + + params = None + _ies = [] + _pps = [] + _download_retcode = None + _num_downloads = None + _screen_file = None + + def __init__(self, params=None, auto_init=True): + """Create a FileDownloader object with the given options.""" + if params is None: + params = {} + self._ies = [] + self._ies_instances = {} + self._pps = [] + self._progress_hooks = [] + self._download_retcode = 0 + self._num_downloads = 0 + self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self._err_file = sys.stderr + self.params = params + self.cache = Cache(self) + + if params.get('bidi_workaround', False): + try: + import pty + master, slave = pty.openpty() + width = compat_get_terminal_size().columns + if width is None: + width_args = [] + else: + width_args = ['-w', str(width)] + sp_kwargs = dict( + stdin=subprocess.PIPE, + stdout=slave, + stderr=self._err_file) + try: + self._output_process = subprocess.Popen( + ['bidiv'] + width_args, **sp_kwargs + ) + except OSError: + self._output_process = subprocess.Popen( + ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_channel = os.fdopen(master, 'rb') + except OSError as ose: + if ose.errno == 2: + self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise + + if (sys.version_info >= (3,) and sys.platform != 'win32' and + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): + # On Python 3, the Unicode filesystem API will throw errors (#1474) + self.report_warning( + 'Assuming --restrict-filenames since file system encoding ' + 'cannot encode all characters. 
' + 'Set the LC_ALL environment variable to fix this.') + self.params['restrictfilenames'] = True + + if isinstance(params.get('outtmpl'), bytes): + self.report_warning( + 'Parameter outtmpl is bytes, but should be a unicode string. ' + 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') + + self._setup_opener() + + if auto_init: + self.print_debug_header() + self.add_default_info_extractors() + + for pp_def_raw in self.params.get('postprocessors', []): + pp_class = get_postprocessor(pp_def_raw['key']) + pp_def = dict(pp_def_raw) + del pp_def['key'] + pp = pp_class(self, **compat_kwargs(pp_def)) + self.add_post_processor(pp) + + for ph in self.params.get('progress_hooks', []): + self.add_progress_hook(ph) + + def warn_if_short_id(self, argv): + # short YouTube ID starting with dash? + idxs = [ + i for i, a in enumerate(argv) + if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] + if idxs: + correct_argv = ( + ['youtube-dl'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] + ) + self.report_warning( + 'Long argument string detected. ' + 'Use -- to separate parameters and URLs, like this:\n%s\n' % + args_to_str(correct_argv)) + + def add_info_extractor(self, ie): + """Add an InfoExtractor object to the end of the list.""" + self._ies.append(ie) + self._ies_instances[ie.ie_key()] = ie + ie.set_downloader(self) + + def get_info_extractor(self, ie_key): + """ + Get an instance of an IE with name ie_key, it will try to get one from + the _ies list, if there's no instance it will create a new one and add + it to the extractor list. 
+ """ + ie = self._ies_instances.get(ie_key) + if ie is None: + ie = get_info_extractor(ie_key)() + self.add_info_extractor(ie) + return ie + + def add_default_info_extractors(self): + """ + Add the InfoExtractors returned by gen_extractors to the end of the list + """ + for ie in gen_extractors(): + self.add_info_extractor(ie) + + def add_post_processor(self, pp): + """Add a PostProcessor object to the end of the chain.""" + self._pps.append(pp) + pp.set_downloader(self) + + def add_progress_hook(self, ph): + """Add the progress hook (currently only for the file downloader)""" + self._progress_hooks.append(ph) + + def _bidi_workaround(self, message): + if not hasattr(self, '_output_channel'): + return message + + assert hasattr(self, '_output_process') + assert isinstance(message, compat_str) + line_count = message.count('\n') + 1 + self._output_process.stdin.write((message + '\n').encode('utf-8')) + self._output_process.stdin.flush() + res = ''.join(self._output_channel.readline().decode('utf-8') + for _ in range(line_count)) + return res[:-len('\n')] + + def to_screen(self, message, skip_eol=False): + """Print message to stdout if not in quiet mode.""" + return self.to_stdout(message, skip_eol, check_quiet=True) + + def _write_string(self, s, out=None): + write_string(s, out=out, encoding=self.params.get('encoding')) + + def to_stdout(self, message, skip_eol=False, check_quiet=False): + """Print message to stdout if not in quiet mode.""" + if self.params.get('logger'): + self.params['logger'].debug(message) + elif not check_quiet or not self.params.get('quiet', False): + message = self._bidi_workaround(message) + terminator = ['\n', ''][skip_eol] + output = message + terminator + + self._write_string(output, self._screen_file) + + def to_stderr(self, message): + """Print message to stderr.""" + assert isinstance(message, compat_str) + if self.params.get('logger'): + self.params['logger'].error(message) + else: + message = self._bidi_workaround(message) + output 
= message + '\n' + self._write_string(output, self._err_file) + + def to_console_title(self, message): + if not self.params.get('consoletitle', False): + return + if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + self._write_string('\033]0;%s\007' % message, self._screen_file) + + def save_console_title(self): + if not self.params.get('consoletitle', False): + return + if 'TERM' in os.environ: + # Save the title on stack + self._write_string('\033[22;0t', self._screen_file) + + def restore_console_title(self): + if not self.params.get('consoletitle', False): + return + if 'TERM' in os.environ: + # Restore the title from stack + self._write_string('\033[23;0t', self._screen_file) + + def __enter__(self): + self.save_console_title() + return self + + def __exit__(self, *args): + self.restore_console_title() + + if self.params.get('cookiefile') is not None: + self.cookiejar.save() + + def trouble(self, message=None, tb=None): + """Determine action to take when a download problem appears. + + Depending on if the downloader has been configured to ignore + download errors or not, this method may throw an exception or + not when errors are found, after printing the message. + + tb, if given, is additional traceback information. 
+ """ + if message is not None: + self.to_stderr(message) + if self.params.get('verbose'): + if tb is None: + if sys.exc_info()[0]: # if .trouble has been called from an except block + tb = '' + if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) + tb += compat_str(traceback.format_exc()) + else: + tb_data = traceback.format_list(traceback.extract_stack()) + tb = ''.join(tb_data) + self.to_stderr(tb) + if not self.params.get('ignoreerrors', False): + if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: + exc_info = sys.exc_info()[1].exc_info + else: + exc_info = sys.exc_info() + raise DownloadError(message, exc_info) + self._download_retcode = 1 + + def report_warning(self, message): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if self.params.get('logger') is not None: + self.params['logger'].warning(message) + else: + if self.params.get('no_warnings'): + return + if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + _msg_header = '\033[0;33mWARNING:\033[0m' + else: + _msg_header = 'WARNING:' + warning_message = '%s %s' % (_msg_header, message) + self.to_stderr(warning_message) + + def report_error(self, message, tb=None): + ''' + Do the same as trouble, but prefixes the message with 'ERROR:', colored + in red if stderr is a tty file. 
+ ''' + if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': + _msg_header = '\033[0;31mERROR:\033[0m' + else: + _msg_header = 'ERROR:' + error_message = '%s %s' % (_msg_header, message) + self.trouble(error_message, tb) + + def report_file_already_downloaded(self, file_name): + """Report file has already been fully downloaded.""" + try: + self.to_screen('[download] %s has already been downloaded' % file_name) + except UnicodeEncodeError: + self.to_screen('[download] The file has already been downloaded') + + def prepare_filename(self, info_dict): + """Generate the output filename.""" + try: + template_dict = dict(info_dict) + + template_dict['epoch'] = int(time.time()) + autonumber_size = self.params.get('autonumber_size') + if autonumber_size is None: + autonumber_size = 5 + autonumber_templ = '%0' + str(autonumber_size) + 'd' + template_dict['autonumber'] = autonumber_templ % self._num_downloads + if template_dict.get('playlist_index') is not None: + template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index']) + if template_dict.get('resolution') is None: + if template_dict.get('width') and template_dict.get('height'): + template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) + elif template_dict.get('height'): + template_dict['resolution'] = '%sp' % template_dict['height'] + elif template_dict.get('width'): + template_dict['resolution'] = '?x%d' % template_dict['width'] + + sanitize = lambda k, v: sanitize_filename( + compat_str(v), + restricted=self.params.get('restrictfilenames'), + is_id=(k == 'id')) + template_dict = dict((k, sanitize(k, v)) + for k, v in template_dict.items() + if v is not None) + template_dict = collections.defaultdict(lambda: 'NA', template_dict) + + outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) + tmpl = compat_expanduser(outtmpl) + filename = tmpl % template_dict + # Temporary fix for #4787 + # 
'Treat' all problem characters by passing filename through preferredencoding + # to workaround encoding issues with subprocess on python2 @ Windows + if sys.version_info < (3, 0) and sys.platform == 'win32': + filename = encodeFilename(filename, True).decode(preferredencoding()) + return filename + except ValueError as err: + self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') + return None + + def _match_entry(self, info_dict, incomplete): + """ Returns None iff the file should be downloaded """ + + video_title = info_dict.get('title', info_dict.get('id', 'video')) + if 'title' in info_dict: + # This can happen when we're just evaluating the playlist + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle: + if not re.search(matchtitle, title, re.IGNORECASE): + return '"' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle: + if re.search(rejecttitle, title, re.IGNORECASE): + return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' + date = info_dict.get('upload_date', None) + if date is not None: + dateRange = self.params.get('daterange', DateRange()) + if date not in dateRange: + return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + view_count = info_dict.get('view_count', None) + if view_count is not None: + min_views = self.params.get('min_views') + if min_views is not None and view_count < min_views: + return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) + max_views = self.params.get('max_views') + if max_views is not None and view_count > max_views: + return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) + if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): + return 'Skipping 
"%s" because it is age restricted' % video_title + if self.in_download_archive(info_dict): + return '%s has already been recorded in archive' % video_title + + if not incomplete: + match_filter = self.params.get('match_filter') + if match_filter is not None: + ret = match_filter(info_dict) + if ret is not None: + return ret + + return None + + @staticmethod + def add_extra_info(info_dict, extra_info): + '''Set the keys from extra_info in info dict if they are missing''' + for key, value in extra_info.items(): + info_dict.setdefault(key, value) + + def extract_info(self, url, download=True, ie_key=None, extra_info={}, + process=True, force_generic_extractor=False): + ''' + Returns a list with a dictionary for each video we find. + If 'download', also downloads the videos. + extra_info is a dict containing the extra values to add to each result + ''' + + if not ie_key and force_generic_extractor: + ie_key = 'Generic' + + if ie_key: + ies = [self.get_info_extractor(ie_key)] + else: + ies = self._ies + + for ie in ies: + if not ie.suitable(url): + continue + + if not ie.working(): + self.report_warning('The program functionality for this site has been marked as broken, ' + 'and will probably not work.') + + try: + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + break + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result + except ExtractorError as de: # An error we somewhat expected + self.report_error(compat_str(de), de.format_traceback()) + break + except MaxDownloadsReached: + raise + except Exception as e: + if self.params.get('ignoreerrors', False): + self.report_error(compat_str(e), 
tb=compat_str(traceback.format_exc())) + break + else: + raise + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def add_default_extra_info(self, ie_result, ie, url): + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'webpage_url_basename': url_basename(url), + 'extractor_key': ie.ie_key(), + }) + + def process_ie_result(self, ie_result, download=True, extra_info={}): + """ + Take the result of the ie(may be modified) and resolve all unresolved + references (URLs, playlist items). + + It will also download the videos if 'download'. + Returns the resolved ie_result. + """ + + result_type = ie_result.get('_type', 'video') + + if result_type in ('url', 'url_transparent'): + extract_flat = self.params.get('extract_flat', False) + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or + extract_flat is True): + if self.params.get('forcejson', False): + self.to_stdout(json.dumps(ie_result)) + return ie_result + + if result_type == 'video': + self.add_extra_info(ie_result, extra_info) + return self.process_video_result(ie_result, download=download) + elif result_type == 'url': + # We have to add extra_info to the results because it may be + # contained in a playlist + return self.extract_info(ie_result['url'], + download, + ie_key=ie_result.get('ie_key'), + extra_info=extra_info) + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + + force_properties = dict( + (k, v) for k, v in ie_result.items() if v is not None) + for f in ('_type', 'url'): + if f in force_properties: + del force_properties[f] + new_result = info.copy() + new_result.update(force_properties) + + assert new_result.get('_type') != 'url_transparent' + + return self.process_ie_result( + new_result, download=download, extra_info=extra_info) + elif result_type == 
'playlist' or result_type == 'multi_video': + # We process each entry in the playlist + playlist = ie_result.get('title', None) or ie_result.get('id', None) + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend', None) + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items', None) + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = iter_playlistitems(playlistitems_str) + + ie_entries = ie_result['entries'] + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = [ + ie_entries[i - 1] for i in playlistitems + if -n_all_entries <= i - 1 < n_all_entries] + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Collected %d video ids (downloading %d of them)" % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Downloading %d videos" % + (ie_result['extractor'], playlist, n_entries)) + else: # iterable + if playlistitems: + entry_list = list(ie_entries) + entries = [entry_list[i - 1] for i in playlistitems] + else: + # ADDED + entries = list(ie_entries) + """ + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + 
""" + n_entries = len(entries) + self.to_screen( + "[%s] playlist %s: Downloading %d videos" % + (ie_result['extractor'], playlist, n_entries)) + + # ADDED + if self.params.get('playlistreverse', False): + entries = entries[::-1] + entries = entries[playliststart:playlistend] + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_index': i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.process_ie_result(entry, + download=download, + extra_info=extra) + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + return ie_result + elif result_type == 'compat_list': + self.report_warning( + 'Extractor %s returned a compat_list result. ' + 'It needs to be updated.' 
% ie_result.get('extractor')) + + def _fixup(r): + self.add_extra_info( + r, + { + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + ) + return r + ie_result['entries'] = [ + self.process_ie_result(_fixup(r), download, extra_info) + for r in ie_result['entries'] + ] + return ie_result + else: + raise Exception('Invalid result type: %s' % result_type) + + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " + + OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s* + (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) + \s*(?P%s)(?P\s*\?)?\s* + (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) + $ + ''' % '|'.join(map(re.escape, OPERATORS.keys()))) + m = operator_rex.search(filter_spec) + if m: + try: + comparison_value = int(m.group('value')) + except ValueError: + comparison_value = parse_filesize(m.group('value')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('value') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid value %r in format specification %r' % ( + m.group('value'), filter_spec)) + op = OPERATORS[m.group('op')] + + if not m: + STR_OPERATORS = { + '=': operator.eq, + '!=': operator.ne, + } + str_operator_rex = re.compile(r'''(?x) + \s*(?Pext|acodec|vcodec|container|protocol) + \s*(?P%s)(?P\s*\?)? 
+ \s*(?P[a-zA-Z0-9_-]+) + \s*$ + ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) + m = str_operator_rex.search(filter_spec) + if m: + comparison_value = m.group('value') + op = STR_OPERATORS[m.group('op')] + + if not m: + raise ValueError('Invalid filter specification %r' % filter_spec) + + def _filter(f): + actual_value = f.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + return _filter + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + GROUP = 'GROUP' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the sourrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + 
last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) + selectors.append(current_selector) + current_selector = None + elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string == '+': + video_selector = current_selector + audio_selector = 
_parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): + formats = list(formats) + if not formats: + return + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + 
if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) + + def selector_function(formats): + formats = list(formats) + for 
pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) + + filters = [self._build_format_filter(f) for f in selector.filters] + + def final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector + + stream = io.BytesIO(format_spec.encode('utf-8')) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) + + def _calc_headers(self, info_dict): + res = std_headers.copy() + + add_headers = info_dict.get('http_headers') + if add_headers: + res.update(add_headers) + + cookies = self._calc_cookies(info_dict) + if cookies: + res['Cookie'] = cookies + + return res + + def _calc_cookies(self, info_dict): + pr = compat_urllib_request.Request(info_dict['url']) + self.cookiejar.add_cookie_header(pr) + return pr.get_header('Cookie') + + def process_video_result(self, info_dict, download=True): + assert info_dict.get('_type', 'video') == 'video' + + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result') + + if 'playlist' not in info_dict: + # It isn't part of a playlist + info_dict['playlist'] = None + info_dict['playlist_index'] = None + + thumbnails = 
def process_video_result(self, info_dict, download=True):
    """Normalise a single-video extractor result and select its formats.

    Fills in derived fields (playlist placeholders, thumbnails, display_id,
    upload_date, requested_subtitles, per-format ids/names/extensions/
    headers), handles the list-only modes and, when `download` is true,
    hands every selected format to process_info().

    Returns the updated info_dict, or None when one of the listing options
    ('listsubtitles', 'listformats', 'list_thumbnails') was served instead.
    Raises ExtractorError for missing mandatory fields, an empty format list
    or an unavailable requested format.
    """
    assert info_dict.get('_type', 'video') == 'video'

    if 'id' not in info_dict:
        raise ExtractorError('Missing "id" field in extractor result')
    if 'title' not in info_dict:
        raise ExtractorError('Missing "title" field in extractor result')

    if 'playlist' not in info_dict:
        # It isn't part of a playlist
        info_dict['playlist'] = None
        info_dict['playlist_index'] = None

    thumbnails = info_dict.get('thumbnails')
    if thumbnails is None:
        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
    if thumbnails:
        def _none_first(value):
            # Python 3 cannot order None against int/str. Pairing every
            # value with a "present" flag keeps missing values first (the
            # Python 2 ordering) without ever comparing None to a value.
            return (value is not None, value if value is not None else 0)

        thumbnails.sort(key=lambda t: (
            _none_first(t.get('preference')),
            _none_first(t.get('width')),
            _none_first(t.get('height')),
            _none_first(t.get('id')),
            _none_first(t.get('url'))))
        for i, t in enumerate(thumbnails):
            if t.get('width') and t.get('height'):
                t['resolution'] = '%dx%d' % (t['width'], t['height'])
            if t.get('id') is None:
                t['id'] = '%d' % i

    if thumbnails and 'thumbnail' not in info_dict:
        # The list is sorted worst-to-best, so the last one is the default
        info_dict['thumbnail'] = thumbnails[-1]['url']

    if 'display_id' not in info_dict and 'id' in info_dict:
        info_dict['display_id'] = info_dict['id']

    if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
        # Working around out-of-range timestamp values (e.g. negative ones on Windows,
        # see http://bugs.python.org/issue1646728)
        try:
            upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
        except (ValueError, OverflowError, OSError):
            pass

    if self.params.get('listsubtitles', False):
        if 'automatic_captions' in info_dict:
            self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
        self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
        return
    info_dict['requested_subtitles'] = self.process_subtitles(
        info_dict['id'], info_dict.get('subtitles'),
        info_dict.get('automatic_captions'))

    # We now pick which formats have to be downloaded
    if info_dict.get('formats') is None:
        # There's only one format available
        formats = [info_dict]
    else:
        formats = info_dict['formats']

    if not formats:
        raise ExtractorError('No video formats found!')

    formats_dict = {}

    # We check that all the formats have the format and format_id fields
    for i, fmt in enumerate(formats):
        if 'url' not in fmt:
            raise ExtractorError('Missing "url" key in result (index %d)' % i)

        if fmt.get('format_id') is None:
            fmt['format_id'] = compat_str(i)
        format_id = fmt['format_id']
        if format_id not in formats_dict:
            formats_dict[format_id] = []
        formats_dict[format_id].append(fmt)

    # Make sure all formats have unique format_id
    for format_id, ambiguous_formats in formats_dict.items():
        if len(ambiguous_formats) > 1:
            for i, fmt in enumerate(ambiguous_formats):
                fmt['format_id'] = '%s-%d' % (format_id, i)

    for i, fmt in enumerate(formats):
        if fmt.get('format') is None:
            fmt['format'] = '{id} - {res}{note}'.format(
                id=fmt['format_id'],
                res=self.format_resolution(fmt),
                note=' ({0})'.format(fmt['format_note']) if fmt.get('format_note') is not None else '',
            )
        # Automatically determine file extension if missing
        if 'ext' not in fmt:
            fmt['ext'] = determine_ext(fmt['url']).lower()
        # Add HTTP headers, so that external programs can use them from the
        # json output
        full_format_info = info_dict.copy()
        full_format_info.update(fmt)
        fmt['http_headers'] = self._calc_headers(full_format_info)

    # TODO Central sorting goes here

    if formats[0] is not info_dict:
        # only set the 'formats' fields if the original info_dict list them
        # otherwise we end up with a circular reference, the first (and unique)
        # element in the 'formats' field in info_dict is info_dict itself,
        # which can't be exported to json
        info_dict['formats'] = formats
    if self.params.get('listformats'):
        self.list_formats(info_dict)
        return
    if self.params.get('list_thumbnails'):
        self.list_thumbnails(info_dict)
        return

    req_format = self.params.get('format')
    if req_format is None:
        req_format_list = []
        if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
                info_dict['extractor'] in ['youtube', 'ted'] and
                not info_dict.get('is_live')):
            merger = FFmpegMergerPP(self)
            if merger.available and merger.can_merge():
                req_format_list.append('bestvideo+bestaudio')
        req_format_list.append('best')
        req_format = '/'.join(req_format_list)
    format_selector = self.build_format_selector(req_format)
    formats_to_download = list(format_selector(formats))
    if not formats_to_download:
        raise ExtractorError('requested format not available',
                             expected=True)

    if download:
        if len(formats_to_download) > 1:
            self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
        for fmt in formats_to_download:
            new_info = dict(info_dict)
            new_info.update(fmt)
            self.process_info(new_info)
    # We update the info dict with the best quality format (backwards compatibility)
    info_dict.update(formats_to_download[-1])
    return info_dict
def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
    """Select the requested subtitles and their format

    Builds the pool of available tracks from `normal_subtitles` (when
    'writesubtitles' is set) and `automatic_captions` (when
    'writeautomaticsub' is set; they never override a normal track), picks
    the languages to fetch and, per language, the first format matching the
    '/'-separated 'subtitlesformat' preference.

    Returns a dict mapping language to the chosen format dict, or None when
    no subtitles were requested or none are available.
    """
    params = self.params
    want_normal = params.get('writesubtitles')
    want_automatic = params.get('writeautomaticsub')

    candidates = {}
    if normal_subtitles and want_normal:
        candidates.update(normal_subtitles)
    if automatic_captions and want_automatic:
        for language, captions in automatic_captions.items():
            candidates.setdefault(language, captions)

    if not (want_normal or want_automatic) or not candidates:
        return None

    if params.get('allsubtitles', False):
        wanted_languages = candidates.keys()
    elif params.get('subtitleslangs', False):
        wanted_languages = params.get('subtitleslangs')
    elif 'en' in candidates:
        wanted_languages = ['en']
    else:
        # No explicit request and no English track: take whichever is first
        wanted_languages = [list(candidates.keys())[0]]

    format_query = params.get('subtitlesformat', 'best')
    preferred_exts = format_query.split('/') if format_query else []

    selected = {}
    for language in wanted_languages:
        variants = candidates.get(language)
        if variants is None:
            self.report_warning('%s subtitles not available for %s' % (language, video_id))
            continue
        for wanted_ext in preferred_exts:
            if wanted_ext == 'best':
                chosen = variants[-1]
                break
            same_ext = [v for v in variants if v['ext'] == wanted_ext]
            if same_ext:
                chosen = same_ext[-1]
                break
        else:
            # Nothing in the preference list matched: fall back to the last entry
            chosen = variants[-1]
            self.report_warning(
                'No subtitle format found matching "%s" for language %s, '
                'using %s' % (format_query, language, chosen['ext']))
        selected[language] = chosen
    return selected
def process_info(self, info_dict):
    """Process a single resolved IE result.

    In order: enforces 'max_downloads', normalises title/format, applies the
    match filter, performs the forced printings, writes the requested side
    files (description, annotations, subtitles, info JSON, thumbnails),
    downloads the media (one file, or several formats queued for merging),
    schedules fixup postprocessors, runs the postprocessors and records the
    video in the download archive.

    Returns None on every path. Raises MaxDownloadsReached when the limit
    was already hit and UnavailableVideoError when the download fails with
    an OSError/IOError.
    """

    assert info_dict.get('_type', 'video') == 'video'

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads >= int(max_downloads):
            raise MaxDownloadsReached()

    info_dict['fulltitle'] = info_dict['title']
    if len(info_dict['title']) > 200:
        info_dict['title'] = info_dict['title'][:197] + '...'

    if 'format' not in info_dict:
        info_dict['format'] = info_dict['ext']

    reason = self._match_entry(info_dict, incomplete=False)
    if reason is not None:
        self.to_screen('[download] ' + reason)
        return

    self._num_downloads += 1

    info_dict['_filename'] = filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        self.to_stdout(info_dict['fulltitle'])
    if self.params.get('forceid', False):
        self.to_stdout(info_dict['id'])
    if self.params.get('forceurl', False):
        if info_dict.get('requested_formats') is not None:
            for f in info_dict['requested_formats']:
                self.to_stdout(f['url'] + f.get('play_path', ''))
        else:
            # For RTMP URLs, also include the playpath
            self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
    if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
        self.to_stdout(info_dict['thumbnail'])
    if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
        self.to_stdout(info_dict['description'])
    if self.params.get('forcefilename', False) and filename is not None:
        self.to_stdout(filename)
    if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
        self.to_stdout(formatSeconds(info_dict['duration']))
    if self.params.get('forceformat', False):
        self.to_stdout(info_dict['format'])
    if self.params.get('forcejson', False):
        self.to_stdout(json.dumps(info_dict))

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    try:
        dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError) as err:
        self.report_error('unable to create directory ' + compat_str(err))
        return

    if self.params.get('writedescription', False):
        descfn = replace_extension(filename, 'description', info_dict.get('ext'))
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
            self.to_screen('[info] Video description is already present')
        elif info_dict.get('description') is None:
            self.report_warning('There\'s no description to write.')
        else:
            try:
                self.to_screen('[info] Writing video description to: ' + descfn)
                with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                    descfile.write(info_dict['description'])
            except (OSError, IOError):
                self.report_error('Cannot write description file ' + descfn)
                return

    if self.params.get('writeannotations', False):
        annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
            self.to_screen('[info] Video annotations are already present')
        else:
            try:
                self.to_screen('[info] Writing video annotations to: ' + annofn)
                with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                    annofile.write(info_dict['annotations'])
            except (KeyError, TypeError):
                # Missing or non-text 'annotations' entry
                self.report_warning('There are no annotations to write.')
            except (OSError, IOError):
                self.report_error('Cannot write annotations file: ' + annofn)
                return

    subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                   self.params.get('writeautomaticsub')])

    if subtitles_are_requested and info_dict.get('requested_subtitles'):
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        subtitles = info_dict['requested_subtitles']
        ie = self.get_info_extractor(info_dict['extractor_key'])
        for sub_lang, sub_info in subtitles.items():
            sub_format = sub_info['ext']
            if sub_info.get('data') is not None:
                sub_data = sub_info['data']
            else:
                try:
                    sub_data = ie._download_webpage(
                        sub_info['url'], info_dict['id'], note=False)
                except ExtractorError as err:
                    self.report_warning('Unable to download subtitle for "%s": %s' %
                                        (sub_lang, compat_str(err.cause)))
                    continue
            try:
                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                    self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
                else:
                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                    with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                        subfile.write(sub_data)
            except (OSError, IOError):
                self.report_error('Cannot write subtitles file ' + sub_filename)
                return

    if self.params.get('writeinfojson', False):
        infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
            self.to_screen('[info] Video description metadata is already present')
        else:
            self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
            try:
                write_json_file(self.filter_requested_info(info_dict), infofn)
            except (OSError, IOError):
                self.report_error('Cannot write metadata to JSON file ' + infofn)
                return

    self._write_thumbnails(info_dict, filename)

    if not self.params.get('skip_download', False):
        try:
            def dl(name, info):
                fd = get_suitable_downloader(info, self.params)(self, self.params)
                for ph in self._progress_hooks:
                    fd.add_progress_hook(ph)
                if self.params.get('verbose'):
                    self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                return fd.download(name, info)

            if info_dict.get('requested_formats') is not None:
                downloaded = []
                success = True
                merger = FFmpegMergerPP(self)
                if not merger.available:
                    postprocessors = []
                    self.report_warning('You have requested multiple '
                                        'formats but ffmpeg or avconv are not installed.'
                                        ' The formats won\'t be merged.')
                else:
                    postprocessors = [merger]

                def compatible_formats(formats):
                    video, audio = formats
                    # Check extension
                    video_ext, audio_ext = video.get('ext'), audio.get('ext')
                    if video_ext and audio_ext:
                        # Every group must be a real tuple: a bare ('webm')
                        # is just the string 'webm', which would turn the
                        # membership test into a substring test.
                        COMPATIBLE_EXTS = (
                            ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
                            ('webm',),
                        )
                        for exts in COMPATIBLE_EXTS:
                            if video_ext in exts and audio_ext in exts:
                                return True
                    # TODO: Check acodec/vcodec
                    return False

                filename_real_ext = os.path.splitext(filename)[1][1:]
                filename_wo_ext = (
                    os.path.splitext(filename)[0]
                    if filename_real_ext == info_dict['ext']
                    else filename)
                requested_formats = info_dict['requested_formats']
                if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
                    info_dict['ext'] = 'mkv'
                    self.report_warning(
                        'Requested formats are incompatible for merge and will be merged into mkv.')
                # Ensure filename always has a correct extension for successful merge
                filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
                if os.path.exists(encodeFilename(filename)):
                    self.to_screen(
                        '[download] %s has already been downloaded and '
                        'merged' % filename)
                else:
                    for f in requested_formats:
                        new_info = dict(info_dict)
                        new_info.update(f)
                        fname = self.prepare_filename(new_info)
                        fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
                        downloaded.append(fname)
                        partial_success = dl(fname, new_info)
                        success = success and partial_success
                    info_dict['__postprocessors'] = postprocessors
                    info_dict['__files_to_merge'] = downloaded
            else:
                # Just a single file
                success = dl(filename, info_dict)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self.report_error('unable to download video data: %s' % str(err))
            return
        except (OSError, IOError) as err:
            raise UnavailableVideoError(err)
        except (ContentTooShortError, ) as err:
            self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            # Fixup content
            fixup_policy = self.params.get('fixup')
            if fixup_policy is None:
                fixup_policy = 'detect_or_warn'

            stretched_ratio = info_dict.get('stretched_ratio')
            if stretched_ratio is not None and stretched_ratio != 1:
                if fixup_policy == 'warn':
                    self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                        info_dict['id'], stretched_ratio))
                elif fixup_policy == 'detect_or_warn':
                    stretched_pp = FFmpegFixupStretchedPP(self)
                    if stretched_pp.available:
                        info_dict.setdefault('__postprocessors', [])
                        info_dict['__postprocessors'].append(stretched_pp)
                    else:
                        self.report_warning(
                            '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
                                info_dict['id'], stretched_ratio))
                else:
                    assert fixup_policy in ('ignore', 'never')

            if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
                if fixup_policy == 'warn':
                    self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
                        info_dict['id']))
                elif fixup_policy == 'detect_or_warn':
                    fixup_pp = FFmpegFixupM4aPP(self)
                    if fixup_pp.available:
                        info_dict.setdefault('__postprocessors', [])
                        info_dict['__postprocessors'].append(fixup_pp)
                    else:
                        self.report_warning(
                            '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
                                info_dict['id']))
                else:
                    assert fixup_policy in ('ignore', 'never')

            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError) as err:
                self.report_error('postprocessing: %s' % str(err))
                return
            self.record_download_archive(info_dict)
def download(self, url_list):
    """Download a given list of URLs.

    Raises SameFileError when several URLs would be written to one fixed
    output name (no '%' in the template and 'max_downloads' is not 1).
    MaxDownloadsReached is reported and re-raised; UnavailableVideoError is
    reported and the loop moves on. Returns self._download_retcode.
    """
    template = self.params.get('outtmpl', DEFAULT_OUTTMPL)
    if len(url_list) > 1 and '%' not in template and self.params.get('max_downloads') != 1:
        raise SameFileError(template)

    for url in url_list:
        use_generic = self.params.get('force_generic_extractor', False)
        try:
            # It also downloads the videos
            result = self.extract_info(url, force_generic_extractor=use_generic)
        except UnavailableVideoError:
            self.report_error('unable to download video')
        except MaxDownloadsReached:
            self.to_screen('[info] Maximum number of downloaded files reached.')
            raise
        else:
            if self.params.get('dump_single_json', False):
                self.to_stdout(json.dumps(result))

    return self._download_retcode


def download_with_info_file(self, info_filename):
    """Process a video described by a previously written info JSON file.

    If processing the stored info raises DownloadError and the info has a
    'webpage_url', a regular download of that URL is attempted instead;
    without one the error propagates. Returns self._download_retcode (or
    the result of the fallback download).
    """
    reader = fileinput.FileInput(
        [info_filename], mode='r',
        openhook=fileinput.hook_encoded('utf-8'))
    with contextlib.closing(reader) as lines:
        # FileInput doesn't have a read method, we can't call json.load
        info = self.filter_requested_info(json.loads('\n'.join(lines)))

    try:
        self.process_ie_result(info, download=True)
    except DownloadError:
        fallback_url = info.get('webpage_url')
        if fallback_url is None:
            raise
        self.report_warning('The info failed to download, trying with "%s"' % fallback_url)
        return self.download([fallback_url])
    return self._download_retcode
self._download_retcode + + @staticmethod + def filter_requested_info(info_dict): + return dict( + (k, v) for k, v in info_dict.items() + if k not in ['requested_formats', 'requested_subtitles']) + + def post_process(self, filename, ie_info): + """Run all the postprocessors on the given file.""" + info = dict(ie_info) + info['filepath'] = filename + pps_chain = [] + if ie_info.get('__postprocessors') is not None: + pps_chain.extend(ie_info['__postprocessors']) + pps_chain.extend(self._pps) + for pp in pps_chain: + files_to_delete = [] + try: + files_to_delete, info = pp.run(info) + except PostProcessingError as e: + self.report_error(e.msg) + if files_to_delete and not self.params.get('keepvideo', False): + for old_filename in files_to_delete: + self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) + try: + os.remove(encodeFilename(old_filename)) + except (IOError, OSError): + self.report_warning('Unable to remove downloaded original file') + + def _make_archive_id(self, info_dict): + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor_key') + if extractor is None: + if 'id' in info_dict: + extractor = info_dict.get('ie_key') # key in a playlist + if extractor is None: + return None # Incomplete video information + return extractor.lower() + ' ' + info_dict['id'] + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if vid_id is None: + return False # Incomplete video information + + try: + with locked_file(fn, 'r', encoding='utf-8') as archive_file: + for line in archive_file: + if line.strip() == vid_id: + return True + except IOError as ioe: + if ioe.errno != errno.ENOENT: + raise + return False + + def record_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return + vid_id = 
self._make_archive_id(info_dict) + assert vid_id + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') + + @staticmethod + def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none': + return 'audio only' + if format.get('resolution') is not None: + return format['resolution'] + if format.get('height') is not None: + if format.get('width') is not None: + res = '%sx%s' % (format['width'], format['height']) + else: + res = '%sp' % format['height'] + elif format.get('width') is not None: + res = '?x%d' % format['width'] + else: + res = default + return res + + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported) ' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + ' ' + if fdict.get('tbr') is not None: + res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] + if fdict.get('vbr') is not None: + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('fps') is not None: + res += ', %sfps' % fdict['fps'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + 
def list_formats(self, info_dict):
    """Print a table of the formats available for the video in info_dict.

    Formats whose 'preference' is below -1000 are hidden. When more than
    one format exists, the last listed row is tagged '(best)'.
    """
    formats = info_dict.get('formats', [info_dict])
    table = [
        [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
        for f in formats
        if f.get('preference') is None or f['preference'] >= -1000]
    # The preference filter may have removed every row; only tag a row that exists
    if len(formats) > 1 and table:
        table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'

    header_line = ['format code', 'extension', 'resolution', 'note']
    self.to_screen(
        '[info] Available formats for %s:\n%s' %
        (info_dict['id'], render_table(header_line, table)))


def list_thumbnails(self, info_dict):
    """Print a table of the thumbnails known for the video in info_dict.

    Falls back to the single 'thumbnail' URL when there is no 'thumbnails'
    list, and prints a notice when there is neither.
    """
    thumbnails = info_dict.get('thumbnails')
    if not thumbnails:
        tn_url = info_dict.get('thumbnail')
        if tn_url:
            thumbnails = [{'id': '0', 'url': tn_url}]
        else:
            self.to_screen(
                '[info] No thumbnails present for %s' % info_dict['id'])
            return

    self.to_screen(
        '[info] Thumbnails for %s:' % info_dict['id'])
    self.to_screen(render_table(
        ['ID', 'width', 'height', 'URL'],
        [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))


def list_subtitles(self, video_id, subtitles, name='subtitles'):
    """Print the languages and formats in `subtitles` for video_id.

    `name` is the label used in the messages (e.g. 'automatic captions').
    Formats are listed in reverse of their stored order.
    """
    if not subtitles:
        self.to_screen('%s has no %s' % (video_id, name))
        return
    self.to_screen(
        'Available %s for %s:' % (name, video_id))
    self.to_screen(render_table(
        ['Language', 'formats'],
        [[lang, ', '.join(f['ext'] for f in reversed(formats))]
            for lang, formats in subtitles.items()]))


def urlopen(self, req):
    """ Start an HTTP download """
    return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
    """Write the '[debug]' environment report; does nothing unless 'verbose' is set.

    Reports encodings, program version, the git revision of the source tree
    (best effort), Python/platform versions, external executable versions
    and the proxy map. With 'call_home' it also queries yt-dl.org for the
    public IP address and the latest released version.
    """
    if not self.params.get('verbose'):
        return

    if type('') is not compat_str:
        # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
        self.report_warning(
            'Your Python is broken! Update to a newer and supported version')

    out_encoding = getattr(
        sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
    encodings_line = '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
        locale.getpreferredencoding(),
        sys.getfilesystemencoding(),
        out_encoding,
        self.get_encoding())
    write_string(encodings_line, encoding=None)

    self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
    try:
        # Best effort: only works when running from a git checkout
        source_dir = os.path.dirname(os.path.abspath(__file__))
        git = subprocess.Popen(
            ['git', 'rev-parse', '--short', 'HEAD'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            cwd=source_dir)
        raw_out, _ = git.communicate()
        revision = raw_out.decode().strip()
        if re.match('[0-9a-f]+', revision):
            self._write_string('[debug] Git HEAD: ' + revision + '\n')
    except Exception:
        try:
            sys.exc_clear()
        except Exception:
            pass
    self._write_string('[debug] Python version %s - %s\n' % (
        platform.python_version(), platform_name()))

    tool_versions = FFmpegPostProcessor.get_versions(self)
    tool_versions['rtmpdump'] = rtmpdump_version()
    found_tools = [
        '%s %s' % (tool, version)
        for tool, version in sorted(tool_versions.items())
        if version]
    tools_line = ', '.join(found_tools)
    if not tools_line:
        tools_line = 'none'
    self._write_string('[debug] exe versions: %s\n' % tools_line)

    proxies = {}
    for handler in self._opener.handlers:
        if hasattr(handler, 'proxies'):
            proxies.update(handler.proxies)
    self._write_string('[debug] Proxy map: ' + compat_str(proxies) + '\n')

    if not self.params.get('call_home', False):
        return
    public_ip = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
    self._write_string('[debug] Public IP address: %s\n' % public_ip)
    newest = self.urlopen(
        'https://yt-dl.org/latest/version').read().decode('utf-8')
    if version_tuple(newest) > version_tuple(__version__):
        self.report_warning(
            'You are using an outdated version (newest version: %s)! '
            'See https://yt-dl.org/update if you need help updating.' %
            newest)
def _setup_opener(self):
    """Create the cookie jar and URL opener used for all HTTP requests.

    Sets self._socket_timeout (600 unless 'socket_timeout' is given),
    self.cookiejar (loaded from 'cookiefile' when that file is readable)
    and self._opener (proxy, HTTPS, cookie and YoutubeDL handlers).
    """
    requested_timeout = self.params.get('socket_timeout')
    if requested_timeout is None:
        self._socket_timeout = 600
    else:
        self._socket_timeout = float(requested_timeout)

    cookie_path = self.params.get('cookiefile')
    proxy_option = self.params.get('proxy')

    if cookie_path is None:
        self.cookiejar = compat_cookiejar.CookieJar()
    else:
        self.cookiejar = compat_cookiejar.MozillaCookieJar(cookie_path)
        if os.access(cookie_path, os.R_OK):
            self.cookiejar.load()

    cookie_processor = compat_urllib_request.HTTPCookieProcessor(self.cookiejar)

    if proxy_option is None:
        proxy_map = compat_urllib_request.getproxies()
        # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
        if 'http' in proxy_map and 'https' not in proxy_map:
            proxy_map['https'] = proxy_map['http']
    elif proxy_option == '':
        # An explicitly empty proxy option means: use no proxy at all
        proxy_map = {}
    else:
        proxy_map = {'http': proxy_option, 'https': proxy_option}
    proxy_handler = PerRequestProxyHandler(proxy_map)

    traffic_debug = 1 if self.params.get('debug_printtraffic') else 0
    https_handler = make_HTTPS_handler(self.params, debuglevel=traffic_debug)
    ydl_handler = YoutubeDLHandler(self.params, debuglevel=traffic_debug)
    url_opener = compat_urllib_request.build_opener(
        proxy_handler, https_handler, cookie_processor, ydl_handler)

    # Delete the default user-agent header, which would otherwise apply in
    # cases where our custom HTTP handler doesn't come into play
    # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
    url_opener.addheaders = []
    self._opener = url_opener


def encode(self, s):
    """Return `s` as bytes in the configured encoding.

    Byte strings are returned untouched. A UnicodeEncodeError is re-raised
    with a hint about the --encoding option appended to its reason.
    """
    if isinstance(s, bytes):
        return s  # Already encoded

    target_encoding = self.get_encoding()
    try:
        return s.encode(target_encoding)
    except UnicodeEncodeError as encode_error:
        encode_error.reason = encode_error.reason + '. Check your system encoding configuration or use the --encoding option.'
        raise
+ raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding + + def _write_thumbnails(self, info_dict, filename): + if self.params.get('writethumbnail', False): + thumbnails = info_dict.get('thumbnails') + if thumbnails: + thumbnails = [thumbnails[-1]] + elif self.params.get('write_all_thumbnails', False): + thumbnails = info_dict.get('thumbnails') + else: + return + + if not thumbnails: + # No thumbnails present, so return immediately + return + + for t in thumbnails: + thumb_ext = determine_ext(t['url'], 'jpg') + suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' + thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): + self.to_screen('[%s] %s: Thumbnail %sis already present' % + (info_dict['extractor'], info_dict['id'], thumb_display_id)) + else: + self.to_screen('[%s] %s: Downloading thumbnail %s...' 
% + (info_dict['extractor'], info_dict['id'], thumb_display_id)) + try: + uf = self.urlopen(t['url']) + with open(thumb_filename, 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % + (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download thumbnail "%s": %s' % + (t['url'], compat_str(err))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py new file mode 100644 index 000000000..55b22c889 --- /dev/null +++ b/youtube_dl/__init__.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +__license__ = 'Public Domain' + +import codecs +import io +import os +import random +import shlex +import sys + + +from .options import ( + parseOpts, +) +from .compat import ( + compat_expanduser, + compat_getpass, + compat_print, + workaround_optparse_bug9161, +) +from .utils import ( + DateRange, + decodeOption, + DEFAULT_OUTTMPL, + DownloadError, + match_filter_func, + MaxDownloadsReached, + preferredencoding, + read_batch_urls, + SameFileError, + setproctitle, + std_headers, + write_string, +) +from .update import update_self +from .downloader import ( + FileDownloader, +) +from .extractor import gen_extractors, list_extractors +from .YoutubeDL import YoutubeDL + + +def _real_main(argv=None): + # Compatibility fixes for Windows + if sys.platform == 'win32': + # https://github.com/rg3/youtube-dl/issues/820 + codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + + workaround_optparse_bug9161() + + setproctitle('youtube-dl') + + parser, opts, args = parseOpts(argv) + + # Set user agent + if opts.user_agent is not None: + std_headers['User-Agent'] = opts.user_agent + + # Set referer + if opts.referer is not None: + std_headers['Referer'] = opts.referer + + # Custom HTTP 
headers + if opts.headers is not None: + for h in opts.headers: + if h.find(':', 1) < 0: + parser.error('wrong header formatting, it should be key:value, not "%s"' % h) + key, value = h.split(':', 2) + if opts.verbose: + write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) + std_headers[key] = value + + # Dump user agent + if opts.dump_user_agent: + compat_print(std_headers['User-Agent']) + sys.exit(0) + + # Batch file verification + batch_urls = [] + if opts.batchfile is not None: + try: + if opts.batchfile == '-': + batchfd = sys.stdin + else: + batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batch_urls = read_batch_urls(batchfd) + if opts.verbose: + write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') + except IOError: + sys.exit('ERROR: batch file could not be read') + all_urls = batch_urls + args + all_urls = [url.strip() for url in all_urls] + _enc = preferredencoding() + all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] + + if opts.list_extractors: + for ie in list_extractors(opts.age_limit): + compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '')) + matchedUrls = [url for url in all_urls if ie.suitable(url)] + for mu in matchedUrls: + compat_print(' ' + mu) + sys.exit(0) + if opts.list_extractor_descriptions: + for ie in list_extractors(opts.age_limit): + if not ie._WORKING: + continue + desc = getattr(ie, 'IE_DESC', ie.IE_NAME) + if desc is False: + continue + if hasattr(ie, 'SEARCH_KEY'): + _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') + _COUNTS = ('', '5', '10', 'all') + desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) + compat_print(desc) + sys.exit(0) + + # Conflicting, missing and erroneous options + if opts.usenetrc and (opts.username is not None or 
opts.password is not None): + parser.error('using .netrc conflicts with giving username/password') + if opts.password is not None and opts.username is None: + parser.error('account username missing\n') + if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): + parser.error('using output template conflicts with using title, video ID or auto number') + if opts.usetitle and opts.useid: + parser.error('using title conflicts with using video ID') + if opts.username is not None and opts.password is None: + opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ratelimit is not None: + numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) + if numeric_limit is None: + parser.error('invalid rate limit specified') + opts.ratelimit = numeric_limit + if opts.min_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) + if numeric_limit is None: + parser.error('invalid min_filesize specified') + opts.min_filesize = numeric_limit + if opts.max_filesize is not None: + numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) + if numeric_limit is None: + parser.error('invalid max_filesize specified') + opts.max_filesize = numeric_limit + if opts.retries is not None: + if opts.retries in ('inf', 'infinite'): + opts_retries = float('inf') + else: + try: + opts_retries = int(opts.retries) + except (TypeError, ValueError): + parser.error('invalid retry count specified') + if opts.buffersize is not None: + numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) + if numeric_buffersize is None: + parser.error('invalid buffer size specified') + opts.buffersize = numeric_buffersize + if opts.playliststart <= 0: + raise ValueError('Playlist start must be positive') + if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: + raise ValueError('Playlist end must be greater than playlist start') + if opts.extractaudio: + if opts.audioformat not in ['best', 
'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: + parser.error('invalid audio format specified') + if opts.audioquality: + opts.audioquality = opts.audioquality.strip('k').strip('K') + if not opts.audioquality.isdigit(): + parser.error('invalid audio quality specified') + if opts.recodevideo is not None: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: + parser.error('invalid video recode format specified') + if opts.convertsubtitles is not None: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + parser.error('invalid subtitle format specified') + + if opts.date is not None: + date = DateRange.day(opts.date) + else: + date = DateRange(opts.dateafter, opts.datebefore) + + # Do not download videos when there are audio-only formats + if opts.extractaudio and not opts.keepvideo and opts.format is None: + opts.format = 'bestaudio/best' + + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + # this was the old behaviour if only --all-sub was given. + if opts.allsubtitles and not opts.writeautomaticsub: + opts.writesubtitles = True + + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) + if not os.path.splitext(outtmpl)[1] and opts.extractaudio: + parser.error('Cannot download a video and extract audio into the same' + ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' + ' template'.format(outtmpl)) + + any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json + any_printing = opts.print_json + download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive + + # PostProcessors + postprocessors = [] + # Add the metadata pp first, the other pps will copy it + if opts.metafromtitle: + postprocessors.append({ + 'key': 'MetadataFromTitle', + 'titleformat': opts.metafromtitle + }) + if opts.addmetadata: + postprocessors.append({'key': 'FFmpegMetadata'}) + if opts.extractaudio: + postprocessors.append({ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': opts.audioformat, + 'preferredquality': opts.audioquality, + 'nopostoverwrites': opts.nopostoverwrites, + }) + if opts.recodevideo: + postprocessors.append({ + 'key': 'FFmpegVideoConvertor', + 'preferedformat': opts.recodevideo, + }) + if opts.convertsubtitles: + postprocessors.append({ + 'key': 'FFmpegSubtitlesConvertor', + 'format': opts.convertsubtitles, + }) + if opts.embedsubtitles: + postprocessors.append({ + 'key': 'FFmpegEmbedSubtitle', + }) + if opts.xattrs: + postprocessors.append({'key': 'XAttrMetadata'}) + if opts.embedthumbnail: + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True + # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. + # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
+ if opts.exec_cmd: + postprocessors.append({ + 'key': 'ExecAfterDownload', + 'exec_cmd': opts.exec_cmd, + }) + if opts.xattr_set_filesize: + try: + import xattr + xattr # Confuse flake8 + except ImportError: + parser.error('setting filesize xattr requested but python-xattr is not available') + external_downloader_args = None + if opts.external_downloader_args: + external_downloader_args = shlex.split(opts.external_downloader_args) + postprocessor_args = None + if opts.postprocessor_args: + postprocessor_args = shlex.split(opts.postprocessor_args) + match_filter = ( + None if opts.match_filter is None + else match_filter_func(opts.match_filter)) + + ydl_opts = { + 'usenetrc': opts.usenetrc, + 'username': opts.username, + 'password': opts.password, + 'twofactor': opts.twofactor, + 'videopassword': opts.videopassword, + 'quiet': (opts.quiet or any_getting or any_printing), + 'no_warnings': opts.no_warnings, + 'forceurl': opts.geturl, + 'forcetitle': opts.gettitle, + 'forceid': opts.getid, + 'forcethumbnail': opts.getthumbnail, + 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, + 'forcefilename': opts.getfilename, + 'forceformat': opts.getformat, + 'forcejson': opts.dumpjson or opts.print_json, + 'dump_single_json': opts.dump_single_json, + 'simulate': opts.simulate or any_getting, + 'skip_download': opts.skip_download, + 'format': opts.format, + 'listformats': opts.listformats, + 'outtmpl': outtmpl, + 'autonumber_size': opts.autonumber_size, + 'restrictfilenames': opts.restrictfilenames, + 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, + 'ratelimit': opts.ratelimit, + 'nooverwrites': opts.nooverwrites, + 'retries': opts_retries, + 'buffersize': opts.buffersize, + 'noresizebuffer': opts.noresizebuffer, + 'continuedl': opts.continue_dl, + 'noprogress': opts.noprogress, + 'progress_with_newline': opts.progress_with_newline, + 'playliststart': opts.playliststart, + 'playlistend': 
opts.playlistend, + 'playlistreverse': opts.playlist_reverse, + 'noplaylist': opts.noplaylist, + 'logtostderr': opts.outtmpl == '-', + 'consoletitle': opts.consoletitle, + 'nopart': opts.nopart, + 'updatetime': opts.updatetime, + 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, + 'writeinfojson': opts.writeinfojson, + 'writethumbnail': opts.writethumbnail, + 'write_all_thumbnails': opts.write_all_thumbnails, + 'writesubtitles': opts.writesubtitles, + 'writeautomaticsub': opts.writeautomaticsub, + 'allsubtitles': opts.allsubtitles, + 'listsubtitles': opts.listsubtitles, + 'subtitlesformat': opts.subtitlesformat, + 'subtitleslangs': opts.subtitleslangs, + 'matchtitle': decodeOption(opts.matchtitle), + 'rejecttitle': decodeOption(opts.rejecttitle), + 'max_downloads': opts.max_downloads, + 'prefer_free_formats': opts.prefer_free_formats, + 'verbose': opts.verbose, + 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, + 'test': opts.test, + 'keepvideo': opts.keepvideo, + 'min_filesize': opts.min_filesize, + 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, + 'daterange': date, + 'cachedir': opts.cachedir, + 'youtube_print_sig_code': opts.youtube_print_sig_code, + 'age_limit': opts.age_limit, + 'download_archive': download_archive_fn, + 'cookiefile': opts.cookiefile, + 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, + 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, + 'bidi_workaround': opts.bidi_workaround, + 'debug_printtraffic': opts.debug_printtraffic, + 'prefer_ffmpeg': opts.prefer_ffmpeg, + 'include_ads': opts.include_ads, + 'default_search': opts.default_search, + 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'encoding': opts.encoding, + 'extract_flat': opts.extract_flat, + 'merge_output_format': opts.merge_output_format, + 'postprocessors': postprocessors, + 
'fixup': opts.fixup, + 'source_address': opts.source_address, + 'call_home': opts.call_home, + 'sleep_interval': opts.sleep_interval, + 'external_downloader': opts.external_downloader, + 'list_thumbnails': opts.list_thumbnails, + 'playlist_items': opts.playlist_items, + 'xattr_set_filesize': opts.xattr_set_filesize, + 'match_filter': match_filter, + 'no_color': opts.no_color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, + 'external_downloader_args': external_downloader_args, + 'postprocessor_args': postprocessor_args, + 'cn_verification_proxy': opts.cn_verification_proxy, + } + + with YoutubeDL(ydl_opts) as ydl: + # Update version + if opts.update_self: + update_self(ydl.to_screen, opts.verbose) + + # Remove cache dir + if opts.rm_cachedir: + ydl.cache.remove() + + # Maybe do nothing + if (len(all_urls) < 1) and (opts.load_info_filename is None): + if opts.update_self or opts.rm_cachedir: + sys.exit() + + ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) + parser.error( + 'You must provide at least one URL.\n' + 'Type youtube-dl --help to see a list of all options.') + + try: + if opts.load_info_filename is not None: + retcode = ydl.download_with_info_file(opts.load_info_filename) + else: + retcode = ydl.download(all_urls) + except MaxDownloadsReached: + ydl.to_screen('--max-download limit reached, aborting.') + retcode = 101 + + sys.exit(retcode) + + +def main(argv=None): + try: + _real_main(argv) + except DownloadError: + sys.exit(1) + except SameFileError: + sys.exit('ERROR: fixed output name but more than one file to download') + except KeyboardInterrupt: + sys.exit('\nERROR: Interrupted by user') + +__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/options.py b/youtube_dl/options.py new file mode 100644 index 000000000..9016e3498 --- /dev/null +++ b/youtube_dl/options.py @@ -0,0 +1,801 @@ +from __future__ import unicode_literals + +import os.path +import optparse 
+import shlex +import sys + +from .downloader.external import list_external_downloaders +from .compat import ( + compat_expanduser, + compat_get_terminal_size, + compat_getenv, + compat_kwargs, +) +from .utils import ( + preferredencoding, + write_string, +) +from .version import __version__ + + +def parseOpts(overrideArguments=None): + def _readOptions(filename_bytes, default=[]): + try: + optionf = open(filename_bytes) + except IOError: + return default # silently skip if file is not present + try: + res = [] + for l in optionf: + res += shlex.split(l, comments=True) + finally: + optionf.close() + return res + + def _readUserConf(): + xdg_config_home = compat_getenv('XDG_CONFIG_HOME') + if xdg_config_home: + userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') + else: + userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl', 'config') + if not os.path.isfile(userConfFile): + userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl.conf') + userConf = _readOptions(userConfFile, None) + + if userConf is None: + appdata_dir = compat_getenv('appdata') + if appdata_dir: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config'), + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), + default=None) + + if userConf is None: + userConf = _readOptions( + os.path.join(compat_expanduser('~'), 'youtube-dl.conf'), + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(compat_expanduser('~'), 'youtube-dl.conf.txt'), + default=None) + + if userConf is None: + userConf = [] + + return userConf + + def _format_option_string(option): + ''' ('-o', '--option') -> -o, --format METAVAR''' + + opts = [] + + if option._short_opts: + opts.append(option._short_opts[0]) + if option._long_opts: + 
opts.append(option._long_opts[0]) + if len(opts) > 1: + opts.insert(1, ', ') + + if option.takes_value(): + opts.append(' %s' % option.metavar) + + return "".join(opts) + + def _comma_separated_values_options_callback(option, opt_str, value, parser): + setattr(parser.values, option.dest, value.split(',')) + + def _hide_login_info(opts): + opts = list(opts) + for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: + try: + i = opts.index(private_opt) + opts[i + 1] = 'PRIVATE' + except ValueError: + pass + return opts + + # No need to wrap help messages if we're on a wide console + columns = compat_get_terminal_size().columns + max_width = columns if columns else 80 + max_help_position = 80 + + fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) + fmt.format_option_strings = _format_option_string + + kw = { + 'version': __version__, + 'formatter': fmt, + 'usage': '%prog [OPTIONS] URL [URL...]', + 'conflict_handler': 'resolve', + } + + parser = optparse.OptionParser(**compat_kwargs(kw)) + + general = optparse.OptionGroup(parser, 'General Options') + general.add_option( + '-h', '--help', + action='help', + help='Print this help text and exit') + general.add_option( + '-v', '--version', + action='version', + help='Print program version and exit') + general.add_option( + '-U', '--update', + action='store_true', dest='update_self', + help='Update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed)') + general.add_option( + '-i', '--ignore-errors', + action='store_true', dest='ignoreerrors', default=False, + help='Continue on download errors, for example to skip unavailable videos in a playlist') + general.add_option( + '--abort-on-error', + action='store_false', dest='ignoreerrors', + help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') + general.add_option( + '--dump-user-agent', + action='store_true', dest='dump_user_agent', default=False, + help='Display the current browser identification') + general.add_option( + '--list-extractors', + action='store_true', dest='list_extractors', default=False, + help='List all supported extractors') + general.add_option( + '--extractor-descriptions', + action='store_true', dest='list_extractor_descriptions', default=False, + help='Output descriptions of all supported extractors') + general.add_option( + '--force-generic-extractor', + action='store_true', dest='force_generic_extractor', default=False, + help='Force extraction to use the generic extractor') + general.add_option( + '--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.') + general.add_option( + '--ignore-config', + action='store_true', + help='Do not read configuration files. 
' + 'When given in the global configuration file /etc/youtube-dl.conf: ' + 'Do not read the user configuration in ~/.config/youtube-dl/config ' + '(%APPDATA%/youtube-dl/config.txt on Windows)') + general.add_option( + '--flat-playlist', + action='store_const', dest='extract_flat', const='in_playlist', + default=False, + help='Do not extract the videos of a playlist, only list them.') + general.add_option( + '--no-color', '--no-colors', + action='store_true', dest='no_color', + default=False, + help='Do not emit color codes in output') + + network = optparse.OptionGroup(parser, 'Network Options') + network.add_option( + '--proxy', dest='proxy', + default=None, metavar='URL', + help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') + network.add_option( + '--socket-timeout', + dest='socket_timeout', type=float, default=None, metavar='SECONDS', + help='Time to wait before giving up, in seconds') + network.add_option( + '--source-address', + metavar='IP', dest='source_address', default=None, + help='Client-side IP address to bind to (experimental)', + ) + network.add_option( + '-4', '--force-ipv4', + action='store_const', const='0.0.0.0', dest='source_address', + help='Make all connections via IPv4 (experimental)', + ) + network.add_option( + '-6', '--force-ipv6', + action='store_const', const='::', dest='source_address', + help='Make all connections via IPv6 (experimental)', + ) + network.add_option( + '--cn-verification-proxy', + dest='cn_verification_proxy', default=None, metavar='URL', + help='Use this proxy to verify the IP address for some Chinese sites. ' + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. 
(experimental)' + ) + + selection = optparse.OptionGroup(parser, 'Video Selection') + selection.add_option( + '--playlist-start', + dest='playliststart', metavar='NUMBER', default=1, type=int, + help='Playlist video to start at (default is %default)') + selection.add_option( + '--playlist-end', + dest='playlistend', metavar='NUMBER', default=None, type=int, + help='Playlist video to end at (default is last)') + selection.add_option( + '--playlist-items', + dest='playlist_items', metavar='ITEM_SPEC', default=None, + help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') + selection.add_option( + '--match-title', + dest='matchtitle', metavar='REGEX', + help='Download only matching titles (regex or caseless sub-string)') + selection.add_option( + '--reject-title', + dest='rejecttitle', metavar='REGEX', + help='Skip download for matching titles (regex or caseless sub-string)') + selection.add_option( + '--max-downloads', + dest='max_downloads', metavar='NUMBER', type=int, default=None, + help='Abort after downloading NUMBER files') + selection.add_option( + '--min-filesize', + metavar='SIZE', dest='min_filesize', default=None, + help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)') + selection.add_option( + '--max-filesize', + metavar='SIZE', dest='max_filesize', default=None, + help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)') + selection.add_option( + '--date', + metavar='DATE', dest='date', default=None, + help='Download only videos uploaded in this date') + selection.add_option( + '--datebefore', + metavar='DATE', dest='datebefore', default=None, + help='Download only videos uploaded on or before this date (i.e. 
inclusive)') + selection.add_option( + '--dateafter', + metavar='DATE', dest='dateafter', default=None, + help='Download only videos uploaded on or after this date (i.e. inclusive)') + selection.add_option( + '--min-views', + metavar='COUNT', dest='min_views', default=None, type=int, + help='Do not download any videos with less than COUNT views') + selection.add_option( + '--max-views', + metavar='COUNT', dest='max_views', default=None, type=int, + help='Do not download any videos with more than COUNT views') + selection.add_option( + '--match-filter', + metavar='FILTER', dest='match_filter', default=None, + help=( + 'Generic video filter (experimental). ' + 'Specify any key (see help for -o for a list of available keys) to' + ' match if the key is present, ' + '!key to check if the key is not present,' + 'key > NUMBER (like "comment_count > 12", also works with ' + '>=, <, <=, !=, =) to compare against a number, and ' + '& to require multiple matches. ' + 'Values which are not known are excluded unless you' + ' put a question mark (?) after the operator.' + 'For example, to only match videos that have been liked more than ' + '100 times and disliked less than 50 times (or the dislike ' + 'functionality is not available at the given service), but who ' + 'also have a description, use --match-filter ' + '"like_count > 100 & dislike_count = (2, 7): + def find_xpath_attr(node, xpath, key, val=None): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z-]+$', key) + if val: + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) + return node.find(expr) +else: + def find_xpath_attr(node, xpath, key, val=None): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! 
+ if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + + for f in node.findall(xpath): + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: + return f + return None + +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter + + +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + + n = node.find(xpath) + if n is None or n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element %s' % name) + else: + return None + return n.text + + +def get_element_by_id(id, html): + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute("id", id, html) + + +def get_element_by_attribute(attribute, value, html): + """Return the content of the tag with the specified attribute in the passed HTML document""" + + m = re.search(r'''(?xs) + <([a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s+%s=['"]?%s['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s*> + (?P.*?) + + ''' % (re.escape(attribute), re.escape(value)), html) + + if not m: + return None + res = m.group('content') + + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] + + return unescapeHTML(res) + + +def clean_html(html): + """Clean an HTML snippet into a readable string""" + + if html is None: # Convenience for sanitizing descriptions etc. + return html + + # Newline vs
+ html = html.replace('\n', ' ') + html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = unescapeHTML(html) + return html.strip() + + +def sanitize_open(filename, open_mode): + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename. If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). + """ + try: + if filename == '-': + if sys.platform == 'win32': + import msvcrt + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) + except (IOError, OSError) as err: + if err.errno in (errno.EACCES,): + raise + + # In case of error, try to remove win32 forbidden chars + alt_filename = sanitize_path(filename) + if alt_filename == filename: + raise + else: + # An exception here should be caught in the caller + stream = open(encodeFilename(alt_filename), open_mode) + return (stream, alt_filename) + + +def timeconvert(timestr): + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp + + +def sanitize_filename(s, restricted=False, is_id=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + """ + def replace_insane(char): + if char == '?' 
or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + # Handle timestamps + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) + result = ''.join(map(replace_insane, s)) + if not is_id: + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] + result = result.lstrip('.') + if not result: + result = '_' + return result + + +def sanitize_path(s): + """Sanitizes and normalizes path on Windows""" + if sys.platform != 'win32': + return s + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) + if drive_or_unc: + norm_path.pop(0) + sanitized_path = [ + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + for path_part in norm_path] + if drive_or_unc: + sanitized_path.insert(0, drive_or_unc + os.path.sep) + return os.path.join(*sanitized_path) + + +def orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + + +def _htmlentity_transform(entity): + """Transforms an HTML entity to a character.""" + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) + if mobj is not None: + 
numstr = mobj.group(1) + if numstr.startswith('x'): + base = 16 + numstr = '0%s' % numstr + else: + base = 10 + return compat_chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return ('&%s;' % entity) + + +def unescapeHTML(s): + if s is None: + return None + assert type(s) == compat_str + + return re.sub( + r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) + + +def get_subprocess_encoding(): + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # For subprocess calls, encode with locale encoding + # Refer to http://stackoverflow.com/a/9951851/35070 + encoding = preferredencoding() + else: + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + return encoding + + +def encodeFilename(s, for_subprocess=False): + """ + @param s The name of the file + """ + + assert type(s) == compat_str + + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s + + # Pass '' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
+ if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + return s + + return s.encode(get_subprocess_encoding(), 'ignore') + + +def decodeFilename(b, for_subprocess=False): + + if sys.version_info >= (3, 0): + return b + + if not isinstance(b, bytes): + return b + + return b.decode(get_subprocess_encoding(), 'ignore') + + +def encodeArgument(s): + if not isinstance(s, compat_str): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + s = s.decode('ascii') + return encodeFilename(s, True) + + +def decodeArgument(b): + return decodeFilename(b, True) + + +def decodeOption(optval): + if optval is None: + return optval + if isinstance(optval, bytes): + optval = optval.decode(preferredencoding()) + + assert isinstance(optval, compat_str) + return optval + + +def formatSeconds(secs): + if secs > 3600: + return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) + elif secs > 60: + return '%d:%02d' % (secs // 60, secs % 60) + else: + return '%d' % secs + + +def make_HTTPS_handler(params, **kwargs): + opts_no_check_certificate = params.get('nocheckcertificate', False) + if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 + context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) + if opts_no_check_certificate: + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + try: + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + except TypeError: + # Python 2.7.8 + # (create_default_context present but HTTPSHandler has no context=) + pass + + if sys.version_info < (3, 2): + return YoutubeDLHTTPSHandler(params, **kwargs) + else: # Python < 3.4 + context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) + context.verify_mode = (ssl.CERT_NONE + if opts_no_check_certificate + else ssl.CERT_REQUIRED) + context.set_default_verify_paths() + return 
def bug_reports_message():
    """Return the standard "please report this issue" suffix for error messages.

    The returned text starts with '; ' so that it can be appended directly to
    an existing message. The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.

        video_id, if given, is prepended to the message; cause, if given, is
        appended to it. Unexpected errors get the bug report hint appended.
        """

        # Network errors and unavailable videos are never youtube-dl bugs, so
        # an error raised while handling one of them is always "expected".
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            # Wrap in a 1-tuple so that a tuple-valued cause is formatted as
            # a single value instead of being unpacked by the % operator.
            msg += ' (caused by %r)' % (cause,)
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    """Expected ExtractorError for an unsupported URL; the URL is kept in .url."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class as well so that str(exc) and exc.args
        # carry the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a readable message; without it str(exc) is empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an http_class connection, honouring the 'source_address' param.

    ydl_handler is the handler whose _params dict is consulted. When it
    contains a non-None 'source_address', the connection is set up to bind
    to that address (with port 0) before connecting.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support on the connection object: replace
            # connect() with one that creates the bound socket itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (read by _create_http_connection) and init the base handler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req over plain HTTP using a connection from _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, accepting both raw and zlib-wrapped streams."""
        try:
            # Raw deflate stream (no zlib header)
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code.

        Falls back to setting .code by hand when addinfourl has no getcode
        and therefore does not take the code as a constructor argument.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Pre-process an outgoing request and return the request to send.

        Percent-encodes the URL if needed, adds the missing standard headers
        and applies the "Youtubedl-No-Compression" marker header.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Post-process a response and return the response to hand on.

        Transparently decodes gzip and deflate bodies and percent-encodes
        the Location header of 3xx responses.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if no attempt
                # succeeds, report the original error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections via _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store params and the connection class (default: HTTPSConnection)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req over HTTPS, forwarding the SSL settings the base handler has."""
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected in the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds and a 'Z' or a [+-]HH[:]MM UTC offset.
    timezone, if given, is a datetime.timedelta used as the UTC offset and
    disables the detection of the suffix. Returns None if date_str is None.
    """

    if date_str is None:
        return None

    if timezone is None:
        # The named groups are required: they are read via m.group() below.
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            # Strip fractional seconds and timezone suffix before strptime
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                # 'Z' suffix: already UTC
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects whether ambiguous numeric dates are read as
    day/month/year (True) or month/day/year (False). Returns None if
    date_str is None or could not be parsed.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    # (skipped for D-M-YYYY dates, whose tail would look like an offset)
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; if several match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date


def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url (ignoring the query string).

    Returns default_ext if url is None or if the text after the last '.'
    is not purely alphanumeric.
    """
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format