[extractor/generic] extract multiple formats for HTML5 video tags

2015-09-09 12:17:40 +01:00 · 2015-09-09 12:17:40 +01:00 · 89474a4317
commit 89474a4317
parent 47004d9579
1 changed file with 32 additions and 22 deletions
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1797,7 +1797,9 @@ class GenericIE(InfoExtractor):
                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
        if not found:
            # HTML5 video
-            found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
+            found = re.findall(r'(?s)<(?:video|audio)[^>]*>(.*?)</(?:video|audio)>', webpage)
            if found:
                found = [re.findall(r'(?s)<source[^>]*src=["\']([^"\']+)["\'][^>]*>', video) for video in found]
        if not found:
            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
            found = re.search(
@@ -1820,33 +1822,41 @@ class GenericIE(InfoExtractor):
            raise UnsupportedError(url)
        entries = []
-        for video_url in found:
+        for video_urls in found:
-            video_url = compat_urlparse.urljoin(url, video_url)
+            if isinstance(video_urls, str):
-            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
+                video_urls = [video_urls]
            # Sometimes, jwplayer extraction will result in a YouTube URL
            if YoutubeIE.suitable(video_url):
                entries.append(self.url_result(video_url, 'Youtube'))
                continue
            video_id = compat_urllib_parse_unquote(os.path.basename(url))
            # here's a fun little line of code for you:
            video_id = os.path.splitext(video_id)[0]
-            ext = determine_ext(video_url)
+            formats = []
-            if ext == 'smil':
+            for video_url in video_urls:
                video_url = compat_urlparse.urljoin(url, video_url)
                # Sometimes, jwplayer extraction will result in a YouTube URL
                if YoutubeIE.suitable(video_url):
                    entries.append(self.url_result(video_url, 'Youtube'))
                    continue
                ext = determine_ext(video_url)
                if ext == 'smil':
                    entries.append({
                        'id': video_id,
                        'formats': self._extract_smil_formats(video_url, video_id),
                        'uploader': video_uploader,
                        'title': video_title,
                        'age_limit': age_limit,
                    })
                elif ext == 'xspf':
                    return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
                else:
                    formats.append({'url': video_url})
            if formats:
                entries.append({
                    'id': video_id,
-                    'formats': self._extract_smil_formats(video_url, video_id),
+                    'formats': formats,
                    'uploader': video_uploader,
                    'title': video_title,
                    'age_limit': age_limit,
                })
            elif ext == 'xspf':
                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
            else:
                entries.append({
                    'id': video_id,
                    'url': video_url,
                    'uploader': video_uploader,
                    'title': video_title,
                    'age_limit': age_limit,