From f880f2847b98faddd31bbb1cd6c2d8cf61e55378 Mon Sep 17 00:00:00 2001 From: Jeroen Meulemeester Date: Wed, 12 Apr 2017 22:15:27 +0200 Subject: [PATCH] Fix remarks on review of #8008 Use the generic extraction method KalturaIE._extract_url() Add support for embedded vier.be partner videos --- youtube_dl/extractor/nieuwsblad.py | 42 +++++++++++++++++++++--------- youtube_dl/extractor/vier.py | 22 +++++++++++++--- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/nieuwsblad.py b/youtube_dl/extractor/nieuwsblad.py index 5f01f26f5..eaf6df333 100644 --- a/youtube_dl/extractor/nieuwsblad.py +++ b/youtube_dl/extractor/nieuwsblad.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .kaltura import KalturaIE from ..utils import ( smuggle_url ) @@ -21,6 +22,8 @@ class NieuwsbladIE(InfoExtractor): 'id': '02036890', 'ext': 'mp4', 'title': 'Krijgt zieke Pauline (3) het mooiste kerstcadeau?', + 'description': 'Er is misschien toch goed nieuws voor de zieke Pauline (3). Het Riziv buigt zich' + ' namelijk over de vraag om de peperdure behandeling van 15.000 euro terug t...', 'thumbnail': 're:http.*jpg$', } }, @@ -32,6 +35,8 @@ class NieuwsbladIE(InfoExtractor): 'id': '01986463', 'ext': 'mp4', 'title': 'Angst voor terreur: fotograaf toont hoe hij de werkelijkheid kan manipuleren', + 'description': 'De metro rijdt niet, de scholen en crèches zijn dicht, vele winkels zijn gesloten. ' + 'Fotograaf Jimmy Kets brengt Brussel vandaag in beeld. Maar hij toont ook...', 'thumbnail': 're:http.*jpg$', } }, @@ -49,6 +54,19 @@ class NieuwsbladIE(InfoExtractor): 'uploader_id': 'dcc-video-manager-hbvl@mediahuis.be' } }, + # Source: Vier.be + { + 'url': 'http://www.nieuwsblad.be/cnt/dmf20170411_02829396', + 'md5': '35cb487bfd8c61fe38c9838420fd0de6', + 'info_dict': { + 'id': '02829396', + 'ext': 'mp4', + 'title': 'Dit is het nieuwste speeltje van Michel Van den Brande', + 'description': 'In de jongste aflevering van \'The Sky is the Limit\' pronkt Michel Van den Brande' + ' met zijn nieuwste speeltje: een glanzende BMW. Een van zijn medewerkers ma...', + 'thumbnail': 're:^https?://.*\.png$', + } + }, ] def _real_extract(self, url): @@ -58,10 +76,19 @@ class NieuwsbladIE(InfoExtractor): iframe_m = re.search(r']+src="(.+?kaltura.com.*?)"', webpage) if iframe_m: - return self._extract_kaltura(url, webpage) + kaltura_url = KalturaIE._extract_url(webpage) + url_with_source = smuggle_url(kaltura_url, {'source_url': url}) + return self.url_result(url_with_source, 'Kaltura') + + iframe_m = re.search(r']+src="(.+?vier.be.*?)"', webpage) + if iframe_m: + vier_url = iframe_m.group(1) + url_with_source = smuggle_url(vier_url, {'source_url': url, 'video_id': video_id}) + return self.url_result(url_with_source, 'Vier') thumbnail = self._og_search_thumbnail(webpage) title = self._og_search_title(webpage) + description = self._og_search_description(webpage) iframe_m = re.search(r']+src="(.+?vrt.be.*?)"', webpage) if iframe_m: @@ -77,17 +104,6 @@ class NieuwsbladIE(InfoExtractor): 'url': video_url, 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail } - - def _extract_kaltura(self, url, web_page): - """ Delegate the video extraction to 'Kaltura' extractor """ - kaltura_id = self._search_regex(r'entry_id\s*:\s*\"(.+?)\"', web_page, 'kaltura_id') - kaltura_wid = self._search_regex(r'wid\s*\:\s*\"(.+?)\"', web_page, 'kaltura_wid') - kaltura_uiconf_id = self._search_regex(r'uiconf_id\s*:\s*\"(.+?)\"', web_page, 'kaltura_uiconf_id') - kaltura_url = ( - 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/{0}/uiconf_id/{1}/entry_id/{2}' - .format(kaltura_wid, kaltura_uiconf_id, kaltura_id) - ) - url_with_source = smuggle_url(kaltura_url, {'source_url': url}) - return self.url_result(url_with_source, 'Kaltura') diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..21a9d2f9c 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,12 +5,19 @@ import re import itertools from .common import InfoExtractor +from ..utils import ( + unsmuggle_url +) class VierIE(InfoExtractor): IE_NAME = 'vier' IE_DESC = 'vier.be and vijf.be' - _VALID_URL = r'https?://(?:www\.)?(?Pvier|vijf)\.be/(?:[^/]+/videos/(?P[^/]+)(?:/(?P\d+))?|video/v3/embed/(?P\d+))' + _VALID_URL = r'https?://(?:www\.)?(?Pvier|vijf)\.be/(' \ + r'?:[^/]+/videos/(?P[^/]+)(?:/(?P\d+))' \ + r'?|video/partner/embed/v2/(?P\d+)/' \ + r'?|video/v3/embed/(?P\d+)' \ + r')' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 'info_dict': { @@ -30,7 +37,7 @@ class VierIE(InfoExtractor): 'id': '2561614', 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 'ext': 'mp4', - 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', + 'title': 'EXTRA: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', }, 'params': { @@ -43,11 +50,15 @@ class VierIE(InfoExtractor): }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, + }, { + 'url': 'http://www.vier.be/video/partner/embed/v2/2658547/4b5a8c17b5358cb1d1b48e57966721bbef6df328/srnieuwsblad/asmh', + 'only_matching': True, }] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - embed_id = mobj.group('embed_id') + embed_id = mobj.group('embed_id') or mobj.group('partner_embed_id') display_id = mobj.group('display_id') or embed_id site = mobj.group('site') @@ -67,6 +78,11 @@ class VierIE(InfoExtractor): formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) + video_id = smuggled_data.get('video_id') or video_id + source_url = smuggled_data.get('source_url') + if source_url: + webpage = self._download_webpage(source_url, display_id) + title = self._og_search_title(webpage, default=display_id) description = self._og_search_description(webpage, default=None) thumbnail = self._og_search_thumbnail(webpage, default=None)