From 0b0fb6fd421e2591190ef6ba25f0c7c6243e391d Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 14:28:03 +0200 Subject: [PATCH 1/8] [liveleak] extract multiple videos --- youtube_dl/extractor/liveleak.py | 146 +++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 857edfde2..cee1e00c7 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -53,9 +53,26 @@ class LiveLeakIE(InfoExtractor): } }] + video_count = 0 + def _video_count(self): + self.video_count += 1 + if self.video_count == 1: + return '' + else: + return '-' + str(self.video_count-1) + + # Removing '.h264_*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/rg3/youtube-dl/pull/4768) + def _get_orig_video_url(self, url): + return re.sub(r'\.h264_.+?\.mp4', '', url) + def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + entries = list() # collect all found videos + + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() video_description = self._og_search_description(webpage) @@ -65,55 +82,94 @@ class LiveLeakIE(InfoExtractor): r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) + # extracts native video #1 (single video, maybe multiple formats) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) - if sources_raw is None: - alt_source = self._search_regex( - r'(file: ".*?"),', webpage, 'video URL', default=None) - if alt_source: - sources_raw = '[{ %s}]' % alt_source - else: - # Maybe an embed? - embed_url = self._search_regex( - r']+src="(http://www.prochan.com/embed\?[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } + if sources_raw: + sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) + sources = json.loads(sources_json) - sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) - sources = json.loads(sources_json) + formats = [{ + 'format_id': '%s' % i, + 'format_note': s.get('label'), + 'url': s['file'], + } for i, s in enumerate(sources)] + for i, s in enumerate(sources): + orig_url = self._get_orig_video_url(s['file']) + if s['file'] != orig_url: + formats.append({ + 'format_id': 'original-%s' % i, + 'format_note': s.get('label'), + 'url': orig_url, + 'preference': 1, + }) + self._sort_formats(formats) - formats = [{ - 'format_id': '%s' % i, - 'format_note': s.get('label'), - 'url': s['file'], - } for i, s in enumerate(sources)] - for i, s in enumerate(sources): - # Removing '.h264_*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/rg3/youtube-dl/pull/4768) - orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) - if s['file'] != orig_url: + entries.append({ + 'id': page_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'formats': formats, + 'age_limit': age_limit, + }) + + # extracts native videos #2 (maybe multiple videos, single format) + sources = re.findall(r'(?s)jwplayer\("file_[0-9a-f]+"\).+?file: "(.*?)"', webpage) + for url in sources: + formats = [{ + 'format_id': '0', + 'format_note': 'standard quality (with logo)', + 'url': url, + }] + orig_url = self._get_orig_video_url(url) + if orig_url != url: formats.append({ - 'format_id': 'original-%s' % i, - 'format_note': s.get('label'), + 'format_id': '1', + 'format_note': 'high quality (no logo)', 'url': orig_url, 'preference': 1, }) - self._sort_formats(formats) + entries.append({ + 'id': page_id + self._video_count(), + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'formats': formats, + 'age_limit': age_limit, + }) - return { - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'formats': formats, - 'age_limit': age_limit, - } + + # collect embedded videos: + embed_urls = list() + + # prochan.com: + embed_prochan = (re.findall( + r']+src="(http://www.prochan.com/embed\?[^"]+)"', + webpage)) + if len(embed_prochan) > 0: + for embed in embed_prochan: + embed_urls.append(embed) + + # add all collected embed urls + for embed_url in embed_urls: + entries.append({ + '_type': 'url_transparent', + 'id': page_id + self._video_count(), + 'url': embed_url, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + }) + + if len(entries) == 0: + raise ExtractorError('No videos found') + if len(entries) == 1: + return entries[0] + else: + return { + '_type': 'multi_video', + 'id': page_id, + 'entries': entries, + } From d67614a921c001fad7bc16a9ccdfd9714465c873 Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 15:04:43 +0200 Subject: [PATCH 2/8] [liveleak] remove rate limit form url (+regex fix) --- youtube_dl/extractor/liveleak.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cee1e00c7..6315ae4ed 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -67,6 +67,9 @@ class LiveLeakIE(InfoExtractor): def _get_orig_video_url(self, url): return re.sub(r'\.h264_.+?\.mp4', '', url) + def _remove_rate_limit(self, url): + return re.sub(r'&ec_rate=[0-9]+', '', url) + def _real_extract(self, url): entries = list() # collect all found videos @@ -92,10 +95,10 @@ class LiveLeakIE(InfoExtractor): formats = [{ 'format_id': '%s' % i, 'format_note': s.get('label'), - 'url': s['file'], + 'url': self._remove_rate_limit(s['file']), } for i, s in enumerate(sources)] for i, s in enumerate(sources): - orig_url = self._get_orig_video_url(s['file']) + orig_url = self._remove_rate_limit(self._get_orig_video_url(s['file'])) if s['file'] != orig_url: formats.append({ 'format_id': 'original-%s' % i, @@ -115,8 +118,9 @@ class LiveLeakIE(InfoExtractor): }) # extracts native videos #2 (maybe multiple videos, single format) - sources = re.findall(r'(?s)jwplayer\("file_[0-9a-f]+"\).+?file: "(.*?)"', webpage) + sources = re.findall(r'(?s)jwplayer.+?file: "(.+?)".+?config:', webpage) for url in sources: + url = self._remove_rate_limit(url) formats = [{ 'format_id': '0', 'format_note': 'standard quality (with logo)', @@ -151,7 +155,7 @@ class LiveLeakIE(InfoExtractor): for embed in embed_prochan: embed_urls.append(embed) - # add all collected embed urls + # add all collected embed urls to list for embed_url in embed_urls: entries.append({ '_type': 'url_transparent', @@ -163,8 +167,6 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) - if len(entries) == 0: - raise ExtractorError('No videos found') if len(entries) == 1: return entries[0] else: From ac7ea5c4c8080f89309f6347bb23774889063367 Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 15:37:52 +0200 Subject: [PATCH 3/8] [liveleak] add extractor for embedded youtube videos --- youtube_dl/extractor/liveleak.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 6315ae4ed..68b37646b 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -148,12 +148,16 @@ class LiveLeakIE(InfoExtractor): embed_urls = list() # prochan.com: - embed_prochan = (re.findall( - r']+src="(http://www.prochan.com/embed\?[^"]+)"', - webpage)) - if len(embed_prochan) > 0: - for embed in embed_prochan: - embed_urls.append(embed) + for embed_prochan in re.findall( + r']+src="(http://www.prochan.com/embed\?[^"]+)"', + webpage): + embed_urls.append(embed_prochan) + + # youtube.com: + for embed_youtube in re.findall( + r']+src="(http[s]?://www.youtube.com/embed/[^"]+)"', + webpage): + embed_urls.append(embed_youtube) # add all collected embed urls to list for embed_url in embed_urls: @@ -167,6 +171,7 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) + if len(entries) == 1: return entries[0] else: From 4ac4c76eade7e078061bd4ebb7771acca84f5299 Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 15:56:51 +0200 Subject: [PATCH 4/8] [liveleak] flake8 --- youtube_dl/extractor/liveleak.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 68b37646b..ee859425c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -54,12 +54,13 @@ class LiveLeakIE(InfoExtractor): }] video_count = 0 + def _video_count(self): self.video_count += 1 if self.video_count == 1: return '' else: - return '-' + str(self.video_count-1) + return '-' + str(self.video_count - 1) # Removing '.h264_*.mp4' gives the raw video, which is essentially # the same video without the LiveLeak logo at the top (see @@ -143,7 +144,6 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) - # collect embedded videos: embed_urls = list() @@ -171,7 +171,6 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) - if len(entries) == 1: return entries[0] else: From 4c24e389dcf34fe18cf0d8e8021d9342d8c27fbe Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 16:33:57 +0200 Subject: [PATCH 5/8] [liveleak] removed global video counter --- youtube_dl/extractor/liveleak.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index ee859425c..346759196 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -53,14 +53,12 @@ class LiveLeakIE(InfoExtractor): } }] - video_count = 0 - - def _video_count(self): - self.video_count += 1 - if self.video_count == 1: + def _video_count(self, entries): + count = len(entries) + if count == 0: return '' else: - return '-' + str(self.video_count - 1) + return '-' + str(count) # Removing '.h264_*.mp4' gives the raw video, which is essentially # the same video without the LiveLeak logo at the top (see @@ -110,7 +108,7 @@ class LiveLeakIE(InfoExtractor): self._sort_formats(formats) entries.append({ - 'id': page_id, + 'id': page_id + self._video_count(entries), 'title': video_title, 'description': video_description, 'uploader': video_uploader, @@ -136,7 +134,7 @@ class LiveLeakIE(InfoExtractor): 'preference': 1, }) entries.append({ - 'id': page_id + self._video_count(), + 'id': page_id + self._video_count(entries), 'title': video_title, 'description': video_description, 'uploader': video_uploader, @@ -163,7 +161,7 @@ class LiveLeakIE(InfoExtractor): for embed_url in embed_urls: entries.append({ '_type': 'url_transparent', - 'id': page_id + self._video_count(), + 'id': page_id + self._video_count(entries), 'url': embed_url, 'title': video_title, 'description': video_description, From 9d74b496c0f98f61dbc1b1ba772017c8b45b9a33 Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 18:48:40 +0200 Subject: [PATCH 6/8] [liveleak] add tests for multi_video and embedded youtube video --- youtube_dl/extractor/liveleak.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 346759196..db70bbc5c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -51,6 +51,28 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'bony333', 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' } + }, { + # Multiple videos per page (https://github.com/rg3/youtube-dl/issues/6542) + 'url': 'http://www.liveleak.com/view?i=677_1439397581', + 'info_dict': { + 'id': '677_1439397581', + 'title': 'Fuel Depot in China Explosion caught on video', + }, + 'playlist_mincount': 3 + }, { + # Embedded youtube video + 'url': 'http://www.liveleak.com/view?i=db4_1442324398', + 'md5': 'c72ce559d02cf26b6540c87d6a015c0c', + 'info_dict': { + 'id': 'db4_1442324398', + 'ext': 'mp4', + 'description': 'Is it worth 6 minutes of your time to listen to this?', + 'uploader': 'Vfor', + 'uploader_id': 'iSSerDc', + 'upload_date': '20070703', + 'title': "Pachelbel's Canon in D - Breathtaking" + } + }] def _video_count(self, entries): @@ -175,5 +197,6 @@ class LiveLeakIE(InfoExtractor): return { '_type': 'multi_video', 'id': page_id, + 'title': 'Fuel Depot in China Explosion caught on video', 'entries': entries, } From 1f52830eaff45b059a306eedc6e5ed58bb8d7efc Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 15 Sep 2015 21:46:42 +0200 Subject: [PATCH 7/8] [liveleak] remove comments --- youtube_dl/extractor/liveleak.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index db70bbc5c..bb5274b1e 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -164,22 +164,18 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) - # collect embedded videos: embed_urls = list() - # prochan.com: for embed_prochan in re.findall( r']+src="(http://www.prochan.com/embed\?[^"]+)"', webpage): embed_urls.append(embed_prochan) - # youtube.com: for embed_youtube in re.findall( r']+src="(http[s]?://www.youtube.com/embed/[^"]+)"', webpage): embed_urls.append(embed_youtube) - # add all collected embed urls to list for embed_url in embed_urls: entries.append({ '_type': 'url_transparent', From 831e7138d1d2eaaf336dddff883b5c9a05aa6c4f Mon Sep 17 00:00:00 2001 From: aeph6Ee0 Date: Tue, 22 Sep 2015 00:50:15 +0200 Subject: [PATCH 8/8] [liveleak] minor fixes --- youtube_dl/extractor/liveleak.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index bb5274b1e..23bcf9437 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -167,12 +167,12 @@ class LiveLeakIE(InfoExtractor): embed_urls = list() for embed_prochan in re.findall( - r']+src="(http://www.prochan.com/embed\?[^"]+)"', + r']+src="(https?://www.prochan.com/embed\?[^"]+)"', webpage): embed_urls.append(embed_prochan) for embed_youtube in re.findall( - r']+src="(http[s]?://www.youtube.com/embed/[^"]+)"', + r']+src="(https?://www.youtube.com/embed/[^"]+)"', webpage): embed_urls.append(embed_youtube) @@ -187,12 +187,9 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, }) - if len(entries) == 1: - return entries[0] - else: - return { - '_type': 'multi_video', - 'id': page_id, - 'title': 'Fuel Depot in China Explosion caught on video', - 'entries': entries, - } + return { + '_type': 'multi_video', + 'id': page_id, + 'title': video_title, + 'entries': entries, + }