diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 857edfde2..cee1e00c7 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -53,9 +53,41 @@ class LiveLeakIE(InfoExtractor):
         }
     }]
 
+    # Running count of the entries numbered so far for the current page.
+    # It lives on the instance, so _real_extract() resets it on every call.
+    video_count = 0
+
+    def _video_count(self):
+        """Return the id suffix for the next entry of the current page.
+
+        The first call after a reset returns '' (the entry keeps the bare
+        page id); later calls return '-1', '-2', ... so that every entry
+        found on one page gets a distinct id.
+        """
+        self.video_count += 1
+        if self.video_count == 1:
+            return ''
+        return '-%d' % (self.video_count - 1)
+
+    def _get_orig_video_url(self, url):
+        """Return url with its '.h264_*.mp4' part removed.
+
+        Removing '.h264_*.mp4' gives the raw video, which is essentially
+        the same video without the LiveLeak logo at the top (see
+        https://github.com/rg3/youtube-dl/pull/4768). URLs without that
+        part are returned unchanged.
+        """
+        return re.sub(r'\.h264_.+?\.mp4', '', url)
+
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        # Start numbering afresh: without this reset the suffixes would
+        # carry over if this instance extracts more than one page.
+        self.video_count = 0
+        entries = []  # collect all found videos
+        seen_urls = set()  # media URLs that already belong to an entry
+
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
 
         video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
         video_description = self._og_search_description(webpage)
@@ -65,55 +97,102 @@ class LiveLeakIE(InfoExtractor):
             r'you confirm that you are ([0-9]+) years and over.',
             webpage, 'age limit', default=None))
 
+        # extracts native video #1 (single video, maybe multiple formats)
         sources_raw = self._search_regex(
             r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
-        if sources_raw is None:
-            alt_source = self._search_regex(
-                r'(file: ".*?"),', webpage, 'video URL', default=None)
-            if alt_source:
-                sources_raw = '[{ %s}]' % alt_source
-            else:
-                # Maybe an embed?
-                embed_url = self._search_regex(
-                    r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
-                    webpage, 'embed URL')
-                return {
-                    '_type': 'url_transparent',
-                    'url': embed_url,
-                    'id': video_id,
-                    'title': video_title,
-                    'description': video_description,
-                    'uploader': video_uploader,
-                    'age_limit': age_limit,
-                }
+        if sources_raw:
+            sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
+            sources = json.loads(sources_json)
 
-        sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
-        sources = json.loads(sources_json)
+            formats = [{
+                'format_id': '%s' % i,
+                'format_note': s.get('label'),
+                'url': s['file'],
+            } for i, s in enumerate(sources)]
+            for i, s in enumerate(sources):
+                orig_url = self._get_orig_video_url(s['file'])
+                if s['file'] != orig_url:
+                    formats.append({
+                        'format_id': 'original-%s' % i,
+                        'format_note': s.get('label'),
+                        'url': orig_url,
+                        'preference': 1,
+                    })
+            self._sort_formats(formats)
+            seen_urls.update(s['file'] for s in sources)
 
-        formats = [{
-            'format_id': '%s' % i,
-            'format_note': s.get('label'),
-            'url': s['file'],
-        } for i, s in enumerate(sources)]
-        for i, s in enumerate(sources):
-            # Removing '.h264_*.mp4' gives the raw video, which is essentially
-            # the same video without the LiveLeak logo at the top (see
-            # https://github.com/rg3/youtube-dl/pull/4768)
-            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
-            if s['file'] != orig_url:
+            entries.append({
+                # Number this entry too, so its id cannot collide with the
+                # first jwplayer video or embed found further down.
+                'id': page_id + self._video_count(),
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'formats': formats,
+                'age_limit': age_limit,
+            })
+
+        # extracts native videos #2 (maybe multiple videos, single format)
+        video_urls = re.findall(
+            r'(?s)jwplayer\("file_[0-9a-f]+"\).+?file: "(.*?)"', webpage)
+        for video_url in video_urls:
+            # This pattern may also hit a file already listed under
+            # "sources:" above; never emit the same media URL twice.
+            if video_url in seen_urls:
+                continue
+            seen_urls.add(video_url)
+            formats = [{
+                'format_id': '0',
+                'format_note': 'standard quality (with logo)',
+                'url': video_url,
+            }]
+            orig_url = self._get_orig_video_url(video_url)
+            if orig_url != video_url:
                 formats.append({
-                    'format_id': 'original-%s' % i,
-                    'format_note': s.get('label'),
+                    'format_id': '1',
+                    'format_note': 'high quality (no logo)',
                     'url': orig_url,
                     'preference': 1,
                 })
-        self._sort_formats(formats)
+            self._sort_formats(formats)
+            entries.append({
+                'id': page_id + self._video_count(),
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'formats': formats,
+                'age_limit': age_limit,
+            })
 
-        return {
-            'id': video_id,
-            'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader,
-            'formats': formats,
-            'age_limit': age_limit,
-        }
+        # collect embedded videos:
+        embed_urls = []
+
+        # prochan.com:
+        embed_urls.extend(re.findall(
+            r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
+            webpage))
+
+        # add all collected embed urls
+        for embed_url in embed_urls:
+            entries.append({
+                '_type': 'url_transparent',
+                'id': page_id + self._video_count(),
+                'url': embed_url,
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+            })
+
+        if not entries:
+            # Imported here because this patch leaves the module's import
+            # block untouched.
+            from ..utils import ExtractorError
+            raise ExtractorError('No videos found')
+        if len(entries) == 1:
+            return entries[0]
+        return {
+            '_type': 'multi_video',
+            'id': page_id,
+            'entries': entries,
+        }