[liveleak] extract multiple videos

2015-09-15 14:28:03 +02:00 · 2015-09-15 14:28:03 +02:00 · 0b0fb6fd42
commit 0b0fb6fd42
parent 31208a07c2
1 changed files with 101 additions and 45 deletions
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@ -53,9 +53,26 @@ class LiveLeakIE(InfoExtractor):
        }
    }]

+    video_count = 0
+    def _video_count(self):
+        """Advance the video counter and return the matching id suffix.
+
+        Returns '' on the first call (counter becomes 1) and '-N' on later
+        calls, where N is the number of calls made before this one.
+        _real_extract appends the result to the page id to build entry ids.
+
+        NOTE(review): the counter starts from the class attribute
+        ``video_count = 0`` and is not reset anywhere in the code shown
+        here, so it presumably keeps growing across pages handled by the
+        same extractor instance -- confirm.
+        """
+        self.video_count += 1
+        if self.video_count == 1:
+            return ''
+        else:
+            return '-' + str(self.video_count-1)
+
+    # Removing '.h264_*.mp4' gives the raw video, which is essentially
+    # the same video without the LiveLeak logo at the top (see
+    # https://github.com/rg3/youtube-dl/pull/4768)
+    def _get_orig_video_url(self, url):
+        """Return url with every '.h264_<...>.mp4' segment removed.
+
+        The result equals url when nothing matches; callers compare the two
+        to decide whether a separate original-quality format exists.
+        """
+        return re.sub(r'\.h264_.+?\.mp4', '', url)
+
    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+
+        entries = list()  # collect all found videos
+
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)

        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
        video_description = self._og_search_description(webpage)
@ -65,55 +82,94 @@ class LiveLeakIE(InfoExtractor):
            r'you confirm that you are ([0-9]+) years and over.',
            webpage, 'age limit', default=None))

+        # native player, variant 1: a single video that may come in several formats
        sources_raw = self._search_regex(
            r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
-        if sources_raw is None:
-            alt_source = self._search_regex(
-                r'(file: ".*?"),', webpage, 'video URL', default=None)
-            if alt_source:
-                sources_raw = '[{ %s}]' % alt_source
-            else:
-                # Maybe an embed?
-                embed_url = self._search_regex(
-                    r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
-                    webpage, 'embed URL')
-                return {
-                    '_type': 'url_transparent',
-                    'url': embed_url,
-                    'id': video_id,
-                    'title': video_title,
-                    'description': video_description,
-                    'uploader': video_uploader,
-                    'age_limit': age_limit,
-                }
+        if sources_raw:
+            sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
+            sources = json.loads(sources_json)

-        sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
-        sources = json.loads(sources_json)
+            formats = [{
+                'format_id': '%s' % i,
+                'format_note': s.get('label'),
+                'url': s['file'],
+            } for i, s in enumerate(sources)]
+            for i, s in enumerate(sources):
+                orig_url = self._get_orig_video_url(s['file'])
+                if s['file'] != orig_url:
+                    formats.append({
+                        'format_id': 'original-%s' % i,
+                        'format_note': s.get('label'),
+                        'url': orig_url,
+                        'preference': 1,
+                    })
+            self._sort_formats(formats)

-        formats = [{
-            'format_id': '%s' % i,
-            'format_note': s.get('label'),
-            'url': s['file'],
-        } for i, s in enumerate(sources)]
-        for i, s in enumerate(sources):
-            # Removing '.h264_*.mp4' gives the raw video, which is essentially
-            # the same video without the LiveLeak logo at the top (see
-            # https://github.com/rg3/youtube-dl/pull/4768)
-            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
-            if s['file'] != orig_url:
+            entries.append({
+                'id': page_id,
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'formats': formats,
+                'age_limit': age_limit,
+            })
+
+        # native player, variant 2: possibly several videos, one source URL each
+        # (a second, logo-free format is added when it can be derived)
+        sources = re.findall(r'(?s)jwplayer\("file_[0-9a-f]+"\).+?file: "(.*?)"', webpage)
+        for url in sources:
+            formats = [{
+                'format_id': '0',
+                'format_note': 'standard quality (with logo)',
+                'url': url,
+            }]
+            orig_url = self._get_orig_video_url(url)
+            if orig_url != url:
                formats.append({
-                    'format_id': 'original-%s' % i,
-                    'format_note': s.get('label'),
+                    'format_id': '1',
+                    'format_note': 'high quality (no logo)',
                    'url': orig_url,
                    'preference': 1,
                })
-        self._sort_formats(formats)
+            entries.append({
+                'id': page_id + self._video_count(),
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'formats': formats,
+                'age_limit': age_limit,
+            })

-        return {
-            'id': video_id,
-            'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader,
-            'formats': formats,
-            'age_limit': age_limit,
-        }
+
+        # collect embedded videos:
+        embed_urls = list()
+
+        # prochan.com:
+        embed_prochan = (re.findall(
+            r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
+            webpage))
+        if len(embed_prochan) > 0:
+            for embed in embed_prochan:
+                embed_urls.append(embed)
+
+        # add all collected embed urls
+        for embed_url in embed_urls:
+            entries.append({
+                '_type': 'url_transparent',
+                'id': page_id + self._video_count(),
+                'url': embed_url,
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+            })
+
+        if len(entries) == 0:
+            raise ExtractorError('No videos found')
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            return {
+                '_type': 'multi_video',
+                'id': page_id,
+                'entries': entries,
+            }