Add CLI options to filter usable extractors

Add `--enable-extractors` and `--disable-extractors` options, which make
it possible to restrict the set of extractors considered when
downloading. This is useful for handling URLs that match multiple
extractors (although this should be rare), or for using only particular
modes of some extractors (for example, only live videos for Twitch, by
enabling only `twitch:stream`).

Both options can be specified multiple times, and each argument is
interpreted as a comma-separated list of fnmatch patterns, to allow the
use of wildcards. Comparisons to extractor names are case-insensitive.
The order of the arguments is not relevant - matching always proceeds as
follows:

- Initialize the set of considered extractors to all available extractors
- If --enable-extractors is specified, remove all extractors that
*don't* match those patterns from consideration
- If --disable-extractors is specified, remove all extractors that *do*
match those patterns from consideration
- If --age-limit is specified, remove all extractors that are not
suitable from consideration

Therefore, disables and the age limit take precedence over enables.
This commit is contained in:
Daniel Miranda 2016-09-17 23:29:55 -03:00
parent 190d2027d0
commit bb6f776271
4 changed files with 146 additions and 5 deletions

View File

@ -69,6 +69,14 @@ which means you can modify it, redistribute it or use it however you like.
extractors
--force-generic-extractor Force extraction to use the generic
extractor
--enable-extractors EXTRACTORS Enable only the chosen extractors. Comma-
separated list of patterns, wildcards
allowed. Example:
"twitch:*,youtube:*,vimeo"
--disable-extractors EXTRACTORS Disable the chosen extractors. Comma-
separated list of patterns, wildcards
allowed. Example:
"twitch:*,youtube:*,vimeo"
--default-search PREFIX Use this prefix for unqualified URLs. For
example "gvsearch2:" downloads two videos
from google videos for youtube-dl "large

View File

@ -11,6 +11,7 @@ import subprocess
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.utils import encodeArgument
from youtube_dl.extractor import gen_extractors, get_info_extractor
rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -39,5 +40,70 @@ class TestExecution(unittest.TestCase):
_, stderr = p.communicate()
self.assertFalse(stderr)
# Names of every extractor whose _WORKING flag is set; used as the expected
# listing for the cases that do not filter anything out.
ALL_EXTRACTORS = [ie.IE_NAME for ie in gen_extractors() if ie._WORKING]
# Table of extractor-selection scenarios, keyed by a short case name that
# becomes part of the generated test method name. Each case may provide:
#   'enable'    - value passed to --enable-extractors (optional)
#   'disable'   - value passed to --disable-extractors (optional)
#   'age_limit' - value passed to --age-limit (optional)
#   'result'    - extractor names expected in the --list-extractors output
EXTRACTOR_CASES = {
# No filtering options at all.
'unrestricted': {
'result': ALL_EXTRACTORS
},
# A catch-all enable pattern keeps everything.
'enable_all': {
'enable': '*',
'result': ALL_EXTRACTORS
},
# A catch-all disable pattern removes everything.
'disable_all': {
'disable': '*',
'result': []
},
# Disabling wins over enabling when both match.
'enable_disable_all': {
'enable': '*',
'disable': '*',
'result': []
},
# Patterns are given in lower case; the expected names keep their real case.
'enable_some': {
'enable': 'youtube,youporn',
'result': ['youtube', 'YouPorn']
},
# Enable a family with a wildcard, then carve one member back out.
'enable_and_filter': {
'enable': 'twitch:*',
'disable': 'twitch:stream',
'result': [ie for ie in ALL_EXTRACTORS if ie.startswith('twitch:') and ie != 'twitch:stream']
},
# The age limit is expected to remove an explicitly enabled extractor
# (presumably YouPorn is rated above 16 - confirm against the extractor).
'enable_age_restricted': {
'enable': 'youporn',
'age_limit': 16,
'result': []
}
}
def gen_extractor_case(case):
    """Build a test method for one extractor-selection scenario.

    ``case`` is a dict with a mandatory ``'result'`` entry (the extractor
    names expected in the listing) and optional ``'enable'``, ``'disable'``
    and ``'age_limit'`` entries that are forwarded to the matching
    command-line options.

    The returned function is meant to be attached to a ``unittest.TestCase``
    subclass. It runs ``youtube_dl/__main__.py --list-extractors`` in a
    subprocess and compares the printed names against the expected ones,
    ignoring order.
    """
    enable = case.get('enable')
    disable = case.get('disable')
    age_limit = case.get('age_limit')
    result = case['result']

    def template(self):
        args = [sys.executable, 'youtube_dl/__main__.py', '--list-extractors']
        if enable:
            args.extend(['--enable-extractors', enable])
        if disable:
            args.extend(['--disable-extractors', disable])
        if age_limit:
            args.extend(['--age-limit', str(age_limit)])

        out = subprocess.check_output(args, cwd=rootDir, stderr=_DEV_NULL).decode('utf-8')
        # splitlines() copes with both LF and CRLF output; blank lines and
        # extractors flagged as broken are not part of the expectation.
        extractors = [line for line in out.splitlines() if line and 'BROKEN' not in line]

        # The order-insensitive assertion is called assertItemsEqual on
        # Python 2 and assertCountEqual on Python 3; use whichever exists.
        assert_same_items = getattr(self, 'assertCountEqual', None) or self.assertItemsEqual
        assert_same_items(extractors, result)

    return template
class TestExtractorSelection(unittest.TestCase):
    """Holder for the test methods generated from EXTRACTOR_CASES."""


# Attach one test method per scenario, named after the scenario key.
for name, case in EXTRACTOR_CASES.items():
    test_name = str('test_' + name)
    test_method = gen_extractor_case(case)
    test_method.__name__ = test_name
    setattr(TestExtractorSelection, test_name, test_method)
    # Drop the module-level reference so only the class attribute remains.
    del test_method


if __name__ == '__main__':
    unittest.main()

View File

@ -10,7 +10,8 @@ import io
import os
import random
import sys
import fnmatch
from collections import OrderedDict
from .options import (
parseOpts,
@ -40,7 +41,7 @@ from .update import update_self
from .downloader import (
FileDownloader,
)
from .extractor import gen_extractors, list_extractors
from .extractor import gen_extractors, gen_extractor_classes
from .extractor.adobepass import MSO_INFO
from .YoutubeDL import YoutubeDL
@ -100,15 +101,67 @@ def _real_main(argv=None):
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
def get_usable_extractors(enable_patterns, disable_patterns, age_limit):
    """Return the extractor instances left after applying the CLI filters.

    Filtering always happens in this order, regardless of the order of the
    command-line arguments:

    1. Start from every extractor produced by ``gen_extractors()``.
    2. If ``enable_patterns`` is non-empty, keep only extractors whose
       lower-cased ``IE_NAME`` matches at least one of those patterns.
    3. Remove every extractor matching one of ``disable_patterns``.
    4. If ``age_limit`` is not None, remove every extractor whose
       ``is_suitable(age_limit)`` is false.

    Patterns are fnmatch-style and are matched against lower-cased names, so
    callers are expected to pass lower-cased patterns. Debug lines are
    written when ``opts.verbose`` (read from the enclosing scope) is set.

    Returns a list, in the order the extractors were selected, so that
    callers may sort it in place.
    """
    # Unfortunately it's necessary to create instances of all extractors
    # instead of just looking at the classes, because some of them don't
    # override the ie_key() classmethod to the correct value.
    all_extractors = OrderedDict((ie.IE_NAME.lower(), ie) for ie in gen_extractors())

    if enable_patterns:
        extractors = OrderedDict()
        all_names = list(all_extractors)
        for pattern in enable_patterns:
            for name in fnmatch.filter(all_names, pattern):
                if name not in extractors:
                    if opts.verbose:
                        write_string('[debug] Enabling extractor %s\n' % name)
                    extractors[name] = all_extractors[name]
    else:
        extractors = all_extractors

    for pattern in disable_patterns or ():
        # fnmatch.filter() returns a new list, so deleting from the mapping
        # inside the loop is safe.
        for name in fnmatch.filter(list(extractors), pattern):
            if opts.verbose:
                write_string('[debug] Disabling extractor %s\n' % name)
            del extractors[name]

    # Compare against None rather than truthiness so that an explicit limit
    # of 0 is still applied.
    if age_limit is not None:
        # Iterate over a snapshot: deleting from a dict while iterating its
        # live items() view raises RuntimeError on Python 3.
        for name, extractor in list(extractors.items()):
            if not extractor.is_suitable(age_limit):
                if opts.verbose:
                    write_string('[debug] Extractor %s selected by filter, but ignored due to age limit\n' % name)
                del extractors[name]

    # Materialize the values: on Python 3 values() is a view with no sort().
    return list(extractors.values())
def patterns_from_args(args):
    """Yield the individual lower-cased patterns from repeated CLI values.

    ``args`` is the list collected for an option that may be given several
    times (or None when the option was never given). Each value is a
    comma-separated list of patterns. Whitespace around each pattern is
    dropped so that ``"twitch:*, youtube"`` behaves like
    ``"twitch:*,youtube"``. Empty patterns are still yielded unchanged, so
    an explicitly empty option value keeps its existing meaning.
    """
    if not args:
        return
    for arg in args:
        for pattern in arg.split(','):
            yield pattern.strip().lower()
enable_extractors = list(patterns_from_args(opts.enable_extractors))
disable_extractors = list(patterns_from_args(opts.disable_extractors))
extractors = get_usable_extractors(enable_extractors, disable_extractors, opts.age_limit)
if opts.list_extractors:
for ie in list_extractors(opts.age_limit):
extractors.sort(key=lambda ie: ie.IE_NAME.lower())
for ie in extractors:
write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
matchedUrls = [url for url in all_urls if ie.suitable(url)]
for mu in matchedUrls:
write_string(' ' + mu + '\n', out=sys.stdout)
sys.exit(0)
if opts.list_extractor_descriptions:
for ie in list_extractors(opts.age_limit):
extractors.sort(key=lambda ie: ie.IE_NAME.lower())
for ie in extractors:
if not ie._WORKING:
continue
desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
@ -413,7 +466,10 @@ def _real_main(argv=None):
}
with YoutubeDL(ydl_opts) as ydl:
if not extractors:
parser.error('No usable extractors selected')
with YoutubeDL(ydl_opts, auto_init=False) as ydl:
# Update version
if opts.update_self:
update_self(ydl.to_screen, opts.verbose, ydl._opener)
@ -422,6 +478,9 @@ def _real_main(argv=None):
if opts.rm_cachedir:
ydl.cache.remove()
for extractor in extractors:
ydl.add_info_extractor(extractor)
# Maybe do nothing
if (len(all_urls) < 1) and (opts.load_info_filename is None):
if opts.update_self or opts.rm_cachedir:

View File

@ -167,6 +167,14 @@ def parseOpts(overrideArguments=None):
'--force-generic-extractor',
action='store_true', dest='force_generic_extractor', default=False,
help='Force extraction to use the generic extractor')
general.add_option(
'--enable-extractors', metavar='EXTRACTORS',
action='append', dest='enable_extractors',
help='Enable only the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"')
general.add_option(
'--disable-extractors', metavar='EXTRACTORS',
action='append', dest='disable_extractors',
help='Disable the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"')
general.add_option(
'--default-search',
dest='default_search', metavar='PREFIX',