From fd2bd579117b7039a2b99080752b8b5952d09f93 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 24 May 2016 18:59:54 +0200 Subject: [PATCH 1/5] [blick] Add new extractor (blick.ch is a swiss newspaper platform, which provides also videos) --- youtube_dl/extractor/blick.py | 123 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 124 insertions(+) create mode 100644 youtube_dl/extractor/blick.py diff --git a/youtube_dl/extractor/blick.py b/youtube_dl/extractor/blick.py new file mode 100644 index 000000000..3a0977205 --- /dev/null +++ b/youtube_dl/extractor/blick.py @@ -0,0 +1,123 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +import re + + +class BlickIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?blick\.ch/.*-id(?P\d+).*\.html' + + _TESTS = [{ + 'url': 'http://www.blick.ch/sport/uli-forte-vor-dem-abstiegs-showdown-ich-gehe-davon-aus-dass-der-fussball-gott-fcz-fan-ist-id5070813.html', + 'info_dict': { + 'id': '5070813', + 'ext': 'mp4', + 'title': 'uli-forte-vor-dem-abstiegs-showdown-ich-gehe-davon-aus-dass-der-fussball-gott-fcz-fan-ist', + 'thumbnail': 'http://blick.simplex.tv/content/51/52/70062/simvid_1.jpg', + 'description': 'Am Mittwochabend entscheidet sich, ob der FCZ oder der FC Lugano aus der Super League absteigt. Uli Forte schwört dabei auf den Fussball-Gott und zündet in der Kirche eine Kerze an.' + } + }, { + 'url': 'http://www.blick.ch/sport/tennis/nominiert-fuer-musik-preis-in-schweden-so-toll-singt-guenthardts-tochter-alessandra-id5066863.html', + 'info_dict': { + 'id': '5066863', + 'ext': 'mp4', + 'title': 'nominiert-fuer-musik-preis-in-schweden-so-toll-singt-guenthardts-tochter-alessandra', + 'thumbnail': 'http://f.blick.ch/img/incoming/crop5066860/5146024130-csquare-w300-h300/Bildschirmfoto-2016-05-23-um-14.jpg', + 'description': 'Da ist Papa Heinz mächtig stolz. Seine Tochter Alessandra Günthardt ist für einen schwedischen Musik-Preis unter den drei Nominierten. Die Abstimmung läuft noch bis 7. Juni.' + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + found_videos_og = re.findall(r'= 1000 and tbr < 2000: + attr = 'sq' + elif tbr >= 2000: + attr = 'hq' + except TypeError: + attr = 'un' + elem['format_id'] = attr + '-' + str(tbr) + entry_info_dict['duration'] = duration + return entry_info_dict + + def calculateDuration(self, m3u8_url, video_id): + content = self._download_webpage_handle( + m3u8_url, + video_id, + note='Downloading m3u8 information', + errnote='Failed to download m3u8 information', + fatal=False + ) + if content is False: + return None + m3u8_doc, rlh = content + duration = 0.0 + try: + for line in m3u8_doc.splitlines(): + if line.startswith('#EXTINF:'): + dur = line[8:].strip()[:-1] + duration += float(dur) + except ValueError: + return None + return duration diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f9fed18f6..e2a6cf315 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -84,6 +84,7 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blick import BlickIE from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE From ea6e87e1655f80468b7382e6de4dfa44ca750624 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Wed, 25 May 2016 02:27:19 +0200 Subject: [PATCH 2/5] [blick] Use builtin functions from common.py instead of re.findall --- youtube_dl/extractor/blick.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/blick.py b/youtube_dl/extractor/blick.py index 3a0977205..c5a1a8442 100644 --- a/youtube_dl/extractor/blick.py +++ b/youtube_dl/extractor/blick.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import RegexNotFoundError import re @@ -26,6 +27,15 @@ class BlickIE(InfoExtractor): 'thumbnail': 'http://f.blick.ch/img/incoming/crop5066860/5146024130-csquare-w300-h300/Bildschirmfoto-2016-05-23-um-14.jpg', 'description': 'Da ist Papa Heinz mächtig stolz. Seine Tochter Alessandra Günthardt ist für einen schwedischen Musik-Preis unter den drei Nominierten. Die Abstimmung läuft noch bis 7. Juni.' } + }, { + 'url': 'http://www.blick.ch/sport/fussball/superleague/totomat-fehler-in-sion-fcz-buff-stinksauer-wegen-falschem-lugano-resultat-id5063421.html', + 'info_dict': { + 'id': '5063421', + 'ext': 'mp4', + 'title': 'totomat-fehler-in-sion-fcz-buff-stinksauer-wegen-falschem-lugano-resultat', + 'thumbnail': 'http://f.blick.ch/img/incoming/crop5063475/820602933-csquare-w300-h300/Bildschirmfoto-2016-05-22-um-19.jpg', + 'description': 'Der FC Zürich bleibt das Schlusslicht der Raiffeisen Super League. Einen dicken Hals bekommen Buff und Co. aber wegen einer falschen Resultatanzeige aus dem Ländle.', + } }] def _real_extract(self, url): @@ -33,10 +43,27 @@ class BlickIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - found_videos_og = re.findall(r' Date: Wed, 25 May 2016 15:45:35 +0200 Subject: [PATCH 3/5] [blick] Use default=False argument instead of catching possible RegexNotFoundError. --- youtube_dl/extractor/blick.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/blick.py b/youtube_dl/extractor/blick.py index c5a1a8442..5f9837cfe 100644 --- a/youtube_dl/extractor/blick.py +++ b/youtube_dl/extractor/blick.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import RegexNotFoundError import re @@ -46,23 +45,12 @@ class BlickIE(InfoExtractor): found_videos = [] regex_og = self._og_regexes('video') regex_ogs = self._og_regexes('video:secure_url') - try: - video_og = self._html_search_regex(regex_og, webpage, name=None) - found_videos.append(video_og) - except RegexNotFoundError: - pass - try: - video_ogs = self._html_search_regex(regex_ogs, webpage, name=None) - if video_ogs not in found_videos: - found_videos.append(video_ogs) - except RegexNotFoundError: - pass - try: - video_meta = self._html_search_meta('contentURL', webpage) - if video_meta not in found_videos: - found_videos.append(video_meta) - except RegexNotFoundError: - pass + video_og = self._html_search_regex(regex_og, webpage, name=None, default=None, fatal=False) + video_ogs = self._html_search_regex(regex_ogs, webpage, name=None, default=None, fatal=False) + video_meta = self._html_search_meta('contentURL', webpage, fatal=False, default=None) + for elem in [video_og, video_ogs, video_meta]: + if elem: + found_videos.append(elem) video_url = '' for video in found_videos: From ab193cf18a6df418bed706c4b0a4920e011f691d Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Fri, 27 May 2016 00:30:00 +0200 Subject: [PATCH 4/5] [blick] Did some suggested changes. - Use _match_id to get the video_id - Extract the video title from the webpage instead of the url - Removed unnecessary "if entry_info_dict.get('formats'):" check --- youtube_dl/extractor/blick.py | 66 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/blick.py b/youtube_dl/extractor/blick.py index 5f9837cfe..d57ed4f75 100644 --- a/youtube_dl/extractor/blick.py +++ b/youtube_dl/extractor/blick.py @@ -13,7 +13,7 @@ class BlickIE(InfoExtractor): 'info_dict': { 'id': '5070813', 'ext': 'mp4', - 'title': 'uli-forte-vor-dem-abstiegs-showdown-ich-gehe-davon-aus-dass-der-fussball-gott-fcz-fan-ist', + 'title': 'Uli Forte vor dem Abstiegs-Showdown: «Ich gehe davon aus, dass der Fussball-Gott FCZ-Fan ist»', 'thumbnail': 'http://blick.simplex.tv/content/51/52/70062/simvid_1.jpg', 'description': 'Am Mittwochabend entscheidet sich, ob der FCZ oder der FC Lugano aus der Super League absteigt. Uli Forte schwört dabei auf den Fussball-Gott und zündet in der Kirche eine Kerze an.' } @@ -22,7 +22,7 @@ class BlickIE(InfoExtractor): 'info_dict': { 'id': '5066863', 'ext': 'mp4', - 'title': 'nominiert-fuer-musik-preis-in-schweden-so-toll-singt-guenthardts-tochter-alessandra', + 'title': 'Nominiert für Musik-Preis in Schweden: So toll singt Günthardts Tochter Alessandra', 'thumbnail': 'http://f.blick.ch/img/incoming/crop5066860/5146024130-csquare-w300-h300/Bildschirmfoto-2016-05-23-um-14.jpg', 'description': 'Da ist Papa Heinz mächtig stolz. Seine Tochter Alessandra Günthardt ist für einen schwedischen Musik-Preis unter den drei Nominierten. Die Abstimmung läuft noch bis 7. Juni.' } @@ -31,22 +31,21 @@ class BlickIE(InfoExtractor): 'info_dict': { 'id': '5063421', 'ext': 'mp4', - 'title': 'totomat-fehler-in-sion-fcz-buff-stinksauer-wegen-falschem-lugano-resultat', + 'title': 'Totomat-Fehler in Sion! FCZ-Buff stinksauer wegen falschem Lugano-Resultat', 'thumbnail': 'http://f.blick.ch/img/incoming/crop5063475/820602933-csquare-w300-h300/Bildschirmfoto-2016-05-22-um-19.jpg', 'description': 'Der FC Zürich bleibt das Schlusslicht der Raiffeisen Super League. Einen dicken Hals bekommen Buff und Co. aber wegen einer falschen Resultatanzeige aus dem Ländle.', } }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) found_videos = [] regex_og = self._og_regexes('video') regex_ogs = self._og_regexes('video:secure_url') - video_og = self._html_search_regex(regex_og, webpage, name=None, default=None, fatal=False) - video_ogs = self._html_search_regex(regex_ogs, webpage, name=None, default=None, fatal=False) + video_og = self._html_search_regex(regex_og, webpage, name=None, default=None) + video_ogs = self._html_search_regex(regex_ogs, webpage, name=None, default=None) video_meta = self._html_search_meta('contentURL', webpage, fatal=False, default=None) for elem in [video_og, video_ogs, video_meta]: if elem: @@ -66,11 +65,7 @@ class BlickIE(InfoExtractor): if not video_url: return [] - video_title = str(url) - b_ind = video_title.rfind('/') + 1 - e_ind = video_title.rfind('-id') - video_title = video_title[b_ind:e_ind] - + video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) @@ -87,32 +82,31 @@ class BlickIE(InfoExtractor): ext='mp4', entry_protocol='m3u8_native') - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) + self._sort_formats(entry_info_dict['formats']) - # Remove entries containing a url to an index.m3u8 file - cleaned_formats = [x for x in entry_info_dict['formats'] if x.get('format_id') != 'meta'] - entry_info_dict['formats'] = cleaned_formats + # Remove entries containing a url to an index.m3u8 file + cleaned_formats = [x for x in entry_info_dict['formats'] if x.get('format_id') != 'meta'] + entry_info_dict['formats'] = cleaned_formats - duration_found = False - duration = None - attr = '' - for elem in entry_info_dict.get('formats'): - if not duration_found: - duration = self.calculateDuration(elem['url'], video_id) - duration_found = True if duration else False - tbr = elem.get('tbr') - try: - attr = '' - if tbr < 1000: - attr = 'lq' - elif tbr >= 1000 and tbr < 2000: - attr = 'sq' - elif tbr >= 2000: - attr = 'hq' - except TypeError: - attr = 'un' - elem['format_id'] = attr + '-' + str(tbr) + duration_found = False + duration = None + attr = '' + for elem in entry_info_dict.get('formats'): + if not duration_found: + duration = self.calculateDuration(elem['url'], video_id) + duration_found = True if duration else False + tbr = elem.get('tbr') + try: + attr = '' + if tbr < 1000: + attr = 'lq' + elif tbr >= 1000 and tbr < 2000: + attr = 'sq' + elif tbr >= 2000: + attr = 'hq' + except TypeError: + attr = 'un' + elem['format_id'] = attr + '-' + str(tbr) entry_info_dict['duration'] = duration return entry_info_dict From 57f9aafeefba3e58e2765f67de5a166d124adca4 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Fri, 27 May 2016 00:38:59 +0200 Subject: [PATCH 5/5] [blick] Do not remove the 'formats' entry, which contains a .m3u8 file for multiple version of the video. --- youtube_dl/extractor/blick.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/blick.py b/youtube_dl/extractor/blick.py index d57ed4f75..566b3969c 100644 --- a/youtube_dl/extractor/blick.py +++ b/youtube_dl/extractor/blick.py @@ -84,10 +84,6 @@ class BlickIE(InfoExtractor): self._sort_formats(entry_info_dict['formats']) - # Remove entries containing a url to an index.m3u8 file - cleaned_formats = [x for x in entry_info_dict['formats'] if x.get('format_id') != 'meta'] - entry_info_dict['formats'] = cleaned_formats - duration_found = False duration = None attr = ''