Add CLI options to filter usable extractors

Add `--enable-extractors` and `--disable-extractors` options, which make it possible to restrict the set of extractors considered when downloading. This is useful for handling URLs that match multiple extractors (although this should be rare), or for using only particular modes of some extractors (for example, only live videos for Twitch, by enabling only `twitch:stream`).

Both options can be specified multiple times, and each argument is interpreted as a comma-separated list of fnmatch patterns, to allow the use of wildcards. Comparisons to extractor names are case-insensitive.

The order of the arguments is not relevant - matching always proceeds as follows:

- Initialize the set of considered extractors to all available
- If --enable-extractors is specified, remove all extractors that *don't* match those patterns from consideration
- If --disable-extractors is specified, remove all extractors that *do* match those patterns from consideration
- If --age-limit is specified, remove all extractors that are not suitable from consideration

Therefore, disables and the age limit take precedence over enables.
2016-09-17 23:29:55 -03:00 · 2016-09-17 23:29:55 -03:00 · bb6f776271
commit bb6f776271
parent 190d2027d0
4 changed files with 146 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -69,6 +69,14 @@ which means you can modify it, redistribute it or use it however you like.
                                     extractors
    --force-generic-extractor        Force extraction to use the generic
                                     extractor
+    --enable-extractors EXTRACTORS   Enable only the chosen extractors. Comma-
+                                     separated list of patterns, wildcards
+                                     allowed. Example:
+                                     "twitch:*,youtube:*,vimeo"
+    --disable-extractors EXTRACTORS  Disable the chosen extractors. Comma-
+                                     separated list of patterns, wildcards
+                                     allowed. Example:
+                                     "twitch:*,youtube:*,vimeo"
    --default-search PREFIX          Use this prefix for unqualified URLs. For
                                     example "gvsearch2:" downloads two videos
                                     from google videos for youtube-dl "large
--- a/test/test_execution.py
+++ b/test/test_execution.py
@@ -11,6 +11,7 @@ import subprocess
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from youtube_dl.utils import encodeArgument
+from youtube_dl.extractor import gen_extractors, get_info_extractor

 rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

@@ -39,5 +40,70 @@ class TestExecution(unittest.TestCase):
        _, stderr = p.communicate()
        self.assertFalse(stderr)

+ALL_EXTRACTORS = [ie.IE_NAME for ie in gen_extractors() if ie._WORKING]
+EXTRACTOR_CASES = {
+    'unrestricted': {
+        'result': ALL_EXTRACTORS
+    },
+    'enable_all': {
+        'enable': '*',
+        'result': ALL_EXTRACTORS
+    },
+    'disable_all': {
+        'disable': '*',
+        'result': []
+    },
+    'enable_disable_all': {
+        'enable': '*',
+        'disable': '*',
+        'result': []
+    },
+    'enable_some': {
+        'enable': 'youtube,youporn',
+        'result': ['youtube', 'YouPorn']
+    },
+    'enable_and_filter': {
+        'enable': 'twitch:*',
+        'disable': 'twitch:stream',
+        'result': [ie for ie in ALL_EXTRACTORS if ie.startswith('twitch:') and ie != 'twitch:stream']
+    },
+    'enable_age_restricted': {
+        'enable': 'youporn',
+        'age_limit': 16,
+        'result': []
+    }
+}
+
+def gen_extractor_case(case):
+    enable = case.get('enable')
+    disable = case.get('disable')
+    age_limit = case.get('age_limit')
+    result = case['result']
+
+    def template(self):
+        args = [sys.executable, 'youtube_dl/__main__.py', '--list-extractors']
+        if enable:
+            args.extend(['--enable-extractors', enable])
+        if disable:
+            args.extend(['--disable-extractors', disable])
+        if age_limit:
+            args.extend(['--age-limit', str(age_limit)])
+
+        out = subprocess.check_output(args, cwd=rootDir, stderr=_DEV_NULL).decode('utf-8')
+        extractors = filter(lambda e: e and 'BROKEN' not in e, out.split('\n'))
+        self.assertItemsEqual(extractors, result)
+
+    return template
+
+class TestExtractorSelection(unittest.TestCase):
+    pass
+
+for name, case in EXTRACTOR_CASES.items():
+    test_method = gen_extractor_case(case)
+    test_name = str('test_' + name)
+    test_method.__name__ = test_name
+    setattr(TestExtractorSelection, test_name, test_method)
+    del test_method
+
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -10,7 +10,8 @@ import io
 import os
 import random
 import sys
-
+import fnmatch
+from collections import OrderedDict

 from .options import (
    parseOpts,
@@ -40,7 +41,7 @@ from .update import update_self
 from .downloader import (
    FileDownloader,
 )
-from .extractor import gen_extractors, list_extractors
+from .extractor import gen_extractors, gen_extractor_classes
 from .extractor.adobepass import MSO_INFO
 from .YoutubeDL import YoutubeDL

@@ -100,15 +101,67 @@ def _real_main(argv=None):
    _enc = preferredencoding()
    all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]

+    def get_usable_extractors(enable_patterns, disable_patterns, age_limit):
+        # Unfortunately it's necessary to create instances of all extractors
+        # instead of just looking at the classes, because some of them don't
+        # override the ie_key() classmethod to the correct value.
+
+        all_extractors = OrderedDict((ie.IE_NAME.lower(), ie) for ie in gen_extractors())
+        extractors = OrderedDict() if enable_patterns else all_extractors
+
+        if enable_patterns:
+            all_names = list(all_extractors.keys())
+            for pattern in enable_patterns:
+                accepted_names = fnmatch.filter(all_names, pattern)
+                for name in accepted_names:
+                    if name not in extractors:
+                        if opts.verbose:
+                            write_string('[debug] Enabling extractor %s\n' % name)
+
+                        extractors[name] = all_extractors[name]
+
+        if disable_patterns:
+            for pattern in disable_patterns:
+                rejected_names = fnmatch.filter(extractors.keys(), pattern)
+                for name in rejected_names:
+                    if opts.verbose:
+                        write_string('[debug] Disabling extractor %s\n' % name)
+
+                    del extractors[name]
+
+        if age_limit:
+            for name, extractor in extractors.items():
+                if not extractor.is_suitable(age_limit):
+                    if opts.verbose:
+                        write_string('[debug] Extractor %s selected by filter, but ignored due to age limit\n' % name)
+
+                    del extractors[name]
+
+        return extractors.values()
+
+    def patterns_from_args(args):
+        if not args:
+            return
+
+        for arg in args:
+            for pattern in arg.split(','):
+                yield pattern.lower()
+
+    enable_extractors = list(patterns_from_args(opts.enable_extractors))
+    disable_extractors = list(patterns_from_args(opts.disable_extractors))
+    extractors = get_usable_extractors(enable_extractors, disable_extractors, opts.age_limit)
+
    if opts.list_extractors:
-        for ie in list_extractors(opts.age_limit):
+        extractors.sort(key=lambda ie: ie.IE_NAME.lower())
+        for ie in extractors:
            write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
            matchedUrls = [url for url in all_urls if ie.suitable(url)]
            for mu in matchedUrls:
                write_string('  ' + mu + '\n', out=sys.stdout)
        sys.exit(0)
    if opts.list_extractor_descriptions:
-        for ie in list_extractors(opts.age_limit):
+        extractors.sort(key=lambda ie: ie.IE_NAME.lower())
+        for ie in extractors:
            if not ie._WORKING:
                continue
            desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
@@ -413,7 +466,10 @@ def _real_main(argv=None):

    }

-    with YoutubeDL(ydl_opts) as ydl:
+    if not extractors:
+        parser.error('No usable extractors selected')
+
+    with YoutubeDL(ydl_opts, auto_init=False) as ydl:
        # Update version
        if opts.update_self:
            update_self(ydl.to_screen, opts.verbose, ydl._opener)
@@ -422,6 +478,9 @@ def _real_main(argv=None):
        if opts.rm_cachedir:
            ydl.cache.remove()

+        for extractor in extractors:
+            ydl.add_info_extractor(extractor)
+
        # Maybe do nothing
        if (len(all_urls) < 1) and (opts.load_info_filename is None):
            if opts.update_self or opts.rm_cachedir:
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -167,6 +167,14 @@ def parseOpts(overrideArguments=None):
        '--force-generic-extractor',
        action='store_true', dest='force_generic_extractor', default=False,
        help='Force extraction to use the generic extractor')
+    general.add_option(
+        '--enable-extractors', metavar='EXTRACTORS',
+        action='append', dest='enable_extractors',
+        help='Enable only the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"')
+    general.add_option(
+        '--disable-extractors', metavar='EXTRACTORS',
+        action='append', dest='disable_extractors',
+        help='Disable the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"')
    general.add_option(
        '--default-search',
        dest='default_search', metavar='PREFIX',