def _search_regex_all(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single pattern or
    a list of patterns, and return a list with one entry per match.

    When a list of patterns is given, every pattern is searched and the
    matches are concatenated in pattern order (all matches of the first
    pattern, then all matches of the second, and so on).

    Each entry of the returned list is:
      * with group=None: the first capture group of that match that is not
        None, or None when no capture group took part in the match;
      * otherwise: the value of m.group(group) for that match.

    In case of failure (no match for any pattern) return the default value
    if one was passed, otherwise raise RegexNotFoundError if fatal is true,
    otherwise report a WARNING and return None. The field name is included
    in the error/warning message.
    """
    # Normalise to a sequence of patterns so both call styles share one loop.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        patterns = [pattern]
    else:
        patterns = pattern

    matches = []
    for p in patterns:
        matches.extend(re.finditer(p, string, flags))

    if matches:
        if group is None:
            # next() gets an explicit default on purpose: a bare next() raising
            # StopIteration inside map() is taken by list() as "end of input"
            # on Python 3 and would silently truncate the result.
            return [
                next((g for g in m.groups() if g is not None), None)
                for m in matches]
        return [m.group(group) for m in matches]

    if default is not NO_DEFAULT:
        return default

    # The (possibly colourised) field name is only needed for the messages
    # below, so it is built on the failure path only.
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)

    self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None