Add bulk support for go: extract every video ID found on a page instead of only the first.

2020-01-10 00:31:55 -08:00 · 2020-01-10 00:31:55 -08:00 · 8e1b235c0e
commit 8e1b235c0e
parent 68fa15155f
2 changed files with 60 additions and 2 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1007,6 +1007,40 @@ class InfoExtractor(object):
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
    def _search_regex_all(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but collect every match of every pattern instead
        of stopping at the first one.

        pattern may be a single regex (string or compiled) or an iterable of
        regexes; the patterns are applied in order and all of their matches
        are gathered, in order of discovery, into one list.

        For each match the value of `group` is taken when it is given;
        otherwise the first capture group that participated in the match is
        used. Matches in which no capture group participated are skipped.

        Returns the non-empty list of extracted values. When nothing was
        extracted: returns `default` if one was supplied, raises
        RegexNotFoundError if `fatal` is true, and otherwise reports a
        warning and returns None.
        """
        # Normalise to an iterable of patterns so a single loop serves both
        # the one-pattern and the many-patterns call styles.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            patterns = (pattern,)
        else:
            patterns = pattern

        ret = []
        for p in patterns:
            for match in re.finditer(p, string, flags):
                if group is None:
                    # A bare next() would let StopIteration escape when the
                    # pattern has no capture group (or none participated);
                    # skip such matches so the regular "not found" handling
                    # below applies instead.
                    value = next((g for g in match.groups() if g is not None), None)
                    if value is None:
                        continue
                else:
                    value = match.group(group)
                ret.append(value)

        if ret:
            return ret
        if default is not NO_DEFAULT:
            return default

        # Only the failure paths need the (possibly colourised) field name.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name
        if fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@ -132,14 +132,21 @@ class GoIE(AdobePassIE):
        brand = site_info.get('brand')
        if not video_id or not site_info:
            webpage = self._download_webpage(url, display_id or video_id)
-            video_id = self._search_regex(
+            video_id = self._search_regex_all(
                (
                    # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
                    # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
                    r'data-video-id=["\']*(VDKA\w+)',
                    # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet
                    r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)'
-                ), webpage, 'video id', default=video_id)
+                ), webpage, 'video id', default=[video_id])
            # Remove duplicates and nulls
            if video_id:
                tmp = []
                [tmp.append(x) for x in video_id if x and x not in tmp]
                video_id = tmp
            if not site_info:
                brand = self._search_regex(
                    (r'data-brand=\s*["\']\s*(\d+)',
@ -160,6 +167,23 @@ class GoIE(AdobePassIE):
                        video['url'], 'Go', video.get('id'), video.get('title')))
                entries.reverse()
                return self.playlist_result(entries, show_id, show_title)
        if not isinstance(video_id, list):
            video_id = [video_id]
        entries = []
        for id in video_id:
            entry = self._real_extract_single(id, site_info, brand)
            if entry:
                entries.append(entry)
        if len(entries) == 0:
            return None
        elif len(entries) == 1:
            return entries[0]
        return self.playlist_result(entries)
    def _real_extract_single(self, video_id, site_info, brand):
        video_data = self._extract_videos(brand, video_id)[0]
        video_id = video_data['id']
        title = video_data['title']