From 657be7fa62986a3180ead83f5b62c2dd0d28cf23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrn=20Brodersen?= Date: Sat, 4 Jul 2015 02:03:13 +0200 Subject: [PATCH 1/7] [wdr] Update for the wdr extractor --- youtube_dl/extractor/wdr.py | 114 +++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index b46802306..4916944ff 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import itertools import re +import json from .common import InfoExtractor from ..compat import ( @@ -16,9 +17,8 @@ from ..utils import ( class WDRIE(InfoExtractor): - _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' - _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)(?P%s)?\.html' % _PLAYER_REGEX - + _PLAYER_REGEX = 'https?://deviceids-medstdp.wdr.de/ondemand/.+?/.+?\.js' + _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)\.html' _TESTS = [ { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', @@ -95,80 +95,86 @@ class WDRIE(InfoExtractor): } ] + def _overiew_page_extractor(self, page_url, page_id, webpage): + entries = [] + for page_num in itertools.count(2): + hrefs = re.findall( + r'
  • \s*]*>\s*\s*\s*]*>\s*\s*\n
    \n Date: Sat, 4 Jul 2015 02:58:46 +0200 Subject: [PATCH 2/7] [wdr] Updated tests and found bugs --- youtube_dl/extractor/wdr.py | 62 +++++++++++++++---------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 4916944ff..1e2900c02 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -21,54 +21,41 @@ class WDRIE(InfoExtractor): _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)\.html' _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', 'info_dict': { - 'id': 'mdb-362427', - 'ext': 'flv', - 'title': 'Servicezeit', - 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', - 'upload_date': '20140310', + 'id': 'mdb-750693', + 'ext': 'mp4', + 'title': 'Streetfood-Pioniere', + 'description': 'md5:bff1fdc6de7df044ac2bec13ab46e6a9', + 'upload_date': '20150703', 'is_live': False }, 'params': { 'skip_download': True, + 'format': 'best' }, }, { - 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', + 'url': 'http://www1.wdr.de/mediathek/audio/1live/einslive-bahnansage-100.html', + 'md5': '87c389aac18ee6fc041aa1ced52aac76', 'info_dict': { - 'id': 'mdb-363194', - 'ext': 'flv', - 'title': 'Marga Spiegel ist tot', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20140311', - 'is_live': False - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', - 'md5': '83e9e8fefad36f357278759870805898', - 'info_dict': { - 'id': 'mdb-194332', + 'id': 'mdb-726385', 'ext': 'mp3', - 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20091129', + 'title': 'Weselsky | 1LIVE Bahnansage (04.06.2015)', + 'description': 'md5:8b9ef2af8c1bb01394ab98f3450ff04d', + 'upload_date': '20150604', 'is_live': False }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', - 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', + 'url': 'http://www.funkhauseuropa.de/musik/musikspecials/roskilde-zweitausendfuenfzehn-100.html', + 'md5': 'e50e0c8900f6558ae12cd9953aca5a20', 'info_dict': { - 'id': 'mdb-478135', + 'id': 'mdb-752045', 'ext': 'mp3', - 'title': 'Flavia Coelho: Amar é Amar', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140717', + 'title': 'Roskilde Festival 2015', + 'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3', + 'upload_date': '20150702', 'is_live': False }, }, @@ -83,10 +70,10 @@ class WDRIE(InfoExtractor): 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', 'info_dict': { 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^24 Stunden Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', 'ext': 'flv', - 'upload_date': '20150212', + 'upload_date': '20150101', 'is_live': True }, 'params': { @@ -122,7 +109,7 @@ class WDRIE(InfoExtractor): webpage = self._download_webpage(url, page_id) entries = re.search(r'%s' % self._PLAYER_REGEX, webpage) - if entries == None: # Overview page + if entries is None: # Overview page return self._overiew_page_extractor(page_url, page_id, webpage) jsonpage = self._download_webpage(entries.group(0), entries.group(0)) @@ -138,7 +125,8 @@ class WDRIE(InfoExtractor): video_url = video_field['audioURL'] else: break - is_live = video_field.get('flashvarsExt', {'isLive': '0'}) == {'isLive': '1'} + is_live = video_field.get('flashvarsExt', {'isLive': '0'}) + is_live = is_live.get('isLive', '0') == '1' if video_url.endswith('.f4m'): video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' @@ -156,7 +144,7 @@ class WDRIE(InfoExtractor): formats.append({'url': video_url, 'ext': ext, 'format_id': _id}) thumbnail = re.search('
    \n
    \n Date: Sat, 4 Jul 2015 13:14:14 +0200 Subject: [PATCH 3/7] [wdr] Use old extractor method and added more formats --- youtube_dl/extractor/wdr.py | 141 ++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 1e2900c02..23e26f8d3 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import itertools import re -import json from .common import InfoExtractor from ..compat import ( @@ -17,15 +16,16 @@ from ..utils import ( class WDRIE(InfoExtractor): - _PLAYER_REGEX = 'https?://deviceids-medstdp.wdr.de/ondemand/.+?/.+?\.js' - _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)\.html' + _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' + _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)(?P%s)?\.html' % _PLAYER_REGEX + _TESTS = [ { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', 'info_dict': { 'id': 'mdb-750693', 'ext': 'mp4', - 'title': 'Streetfood-Pioniere', + 'title': 'HIER UND HEUTE: Streetfood-Pioniere', 'description': 'md5:bff1fdc6de7df044ac2bec13ab46e6a9', 'upload_date': '20150703', 'is_live': False @@ -41,8 +41,8 @@ class WDRIE(InfoExtractor): 'info_dict': { 'id': 'mdb-726385', 'ext': 'mp3', - 'title': 'Weselsky | 1LIVE Bahnansage (04.06.2015)', - 'description': 'md5:8b9ef2af8c1bb01394ab98f3450ff04d', + 'title': '1LIVE Bahnansage', + 'description': 'md5:36016b06288e1f1a5b2602c8fe947b8d', 'upload_date': '20150604', 'is_live': False }, @@ -54,7 +54,7 @@ class WDRIE(InfoExtractor): 'id': 'mdb-752045', 'ext': 'mp3', 'title': 'Roskilde Festival 2015', - 'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3', + 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', 'upload_date': '20150702', 'is_live': False }, @@ -82,82 +82,99 @@ class WDRIE(InfoExtractor): } ] - def _overiew_page_extractor(self, page_url, page_id, webpage): - entries = [] - for page_num in itertools.count(2): - hrefs = re.findall( - r'
  • \s*]*>\s*\s*\s*]*>\s*\s*\n
    \n\n\n Date: Sat, 4 Jul 2015 23:46:18 +0200 Subject: [PATCH 4/7] [wdr] Separate media and playlist extractiona bit more; Don't return a playlist for single items --- youtube_dl/extractor/wdr.py | 77 +++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 23e26f8d3..ce9918ce0 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -21,7 +21,7 @@ class WDRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', # Test single media extraction (video) 'info_dict': { 'id': 'mdb-750693', 'ext': 'mp4', @@ -36,38 +36,46 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www1.wdr.de/mediathek/audio/1live/einslive-bahnansage-100.html', + 'url': 'http://www1.wdr.de/mediathek/audio/1live/einslive-bahnansage-100.html', # Test single media extraction (audio) 'md5': '87c389aac18ee6fc041aa1ced52aac76', 'info_dict': { 'id': 'mdb-726385', 'ext': 'mp3', 'title': '1LIVE Bahnansage', - 'description': 'md5:36016b06288e1f1a5b2602c8fe947b8d', + 'description': 'md5:8b9ef2af8c1bb01394ab98f3450ff04d', 'upload_date': '20150604', 'is_live': False }, }, { - 'url': 'http://www.funkhauseuropa.de/musik/musikspecials/roskilde-zweitausendfuenfzehn-100.html', + 'url': 'http://www.funkhauseuropa.de/musik/musikspecials/roskilde-zweitausendfuenfzehn-100.html', # Test single media extraction (audio) 'md5': 'e50e0c8900f6558ae12cd9953aca5a20', 'info_dict': { 'id': 'mdb-752045', 'ext': 'mp3', 'title': 'Roskilde Festival 2015', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', + 'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3', 'upload_date': '20150702', 'is_live': False }, }, { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', # Test playlist extraction (containing links to webpages) 'playlist_mincount': 146, 'info_dict': { 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', + 'title': 'md5:31d3634678b18f90a9fc4e7cd34ba3b2' } }, { - 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'url': 'http://www.funkhauseuropa.de/index.html', # Test playlist extraction (containing links to playerpages) + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'index', + } + }, + { + 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', # Test live tv 'info_dict': { 'id': 'mdb-103364', 'title': 're:^24 Stunden Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', @@ -82,23 +90,7 @@ class WDRIE(InfoExtractor): } ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_url = mobj.group('url') - page_id = mobj.group('id') - - webpage = self._download_webpage(url, page_id) - - if mobj.group('player') is None: - entries = [ - self.url_result(page_url + href, 'WDR') - for href in re.findall(r'\n\n 1: # Different playlist page + return self.playlist_result(entries, page_id) + + elif mobj.group('player') is not None or (entries and len(entries) == 1): # Media page (either just a single player link on the webpage or the webpage is the player) + if not entries: + entries = None + return self._media_extract(page_url, page_id, mobj, webpage, entries) + class WDRMobileIE(InfoExtractor): _VALID_URL = r'''(?x) From 83d79150689ba66e103a374355d87447c32419c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrn=20Brodersen?= Date: Sun, 5 Jul 2015 00:41:23 +0200 Subject: [PATCH 5/7] [wdr] Use of self._extract_m3u8_formats and utils.qualities --- youtube_dl/extractor/wdr.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index ce9918ce0..d81cdfd8f 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( determine_ext, unified_strdate, + qualities ) @@ -78,7 +79,7 @@ class WDRIE(InfoExtractor): 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', # Test live tv 'info_dict': { 'id': 'mdb-103364', - 'title': 're:^24 Stunden Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', 'ext': 'flv', 'upload_date': '20150101', @@ -151,25 +152,24 @@ class WDRIE(InfoExtractor): formats.append({'ext': ext, 'url': video_url}) - m3u8_url = re.search(r'
  • \n\n\n Date: Tue, 14 Jul 2015 14:56:45 +0200 Subject: [PATCH 6/7] [wdr] small changes in how to decide if a url is a playlist --- youtube_dl/extractor/wdr.py | 58 +++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index d81cdfd8f..6248c8b00 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -22,7 +22,22 @@ class WDRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', # Test single media extraction (video) + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html', # Test single media extraction (video, link to webpage) + 'info_dict': { + 'id': 'mdb-750693', + 'ext': 'mp4', + 'title': 'HIER UND HEUTE: Streetfood-Pioniere', + 'description': 'md5:bff1fdc6de7df044ac2bec13ab46e6a9', + 'upload_date': '20150703', + 'is_live': False + }, + 'params': { + 'skip_download': True, + 'format': 'best' + }, + }, + { + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100-videoplayer_size-L.html', # Test single media extraction (video, link to playerpage) 'info_dict': { 'id': 'mdb-750693', 'ext': 'mp4', @@ -49,23 +64,35 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www.funkhauseuropa.de/musik/musikspecials/roskilde-zweitausendfuenfzehn-100.html', # Test single media extraction (audio) + 'url': 'http://www.funkhauseuropa.de/av/audioroskildefestival100-audioplayer.html', # Test single media extraction (audio) 'md5': 'e50e0c8900f6558ae12cd9953aca5a20', 'info_dict': { 'id': 'mdb-752045', 'ext': 'mp3', 'title': 'Roskilde Festival 2015', - 'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3', + 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', 'upload_date': '20150702', 'is_live': False }, }, + { + 'url': 'http://www.funkhauseuropa.de/themen/aktuell/zwanzig-jahre-mpdrei-100.html', # Test single media extraction (audio) + 'md5': 'a0966afb15714a5c5a364b8d36a6e721', + 'info_dict': { + 'id': 'mdb-762163', + 'ext': 'mp3', + 'title': '20 Jahre mp3', + 'description': 'md5:5b1d78b210443081e9a08a9d0fb78306', + 'upload_date': '20150714', + 'is_live': False + }, + }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', # Test playlist extraction (containing links to webpages) 'playlist_mincount': 146, 'info_dict': { 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', - 'title': 'md5:31d3634678b18f90a9fc4e7cd34ba3b2' + 'title': 'md5:acf18a9eb2e3342d05de07380f1672b4' } }, { @@ -110,11 +137,11 @@ class WDRIE(InfoExtractor): note='Downloading playlist page %d' % page_num) return self.playlist_result(entries, page_id, webpage) - def _media_extract(self, page_url, page_id, mobj, webpage, entries): - if mobj.group('player') is None: - mobj = re.search(self._VALID_URL, entries[0]['url']) - playerpage = self._download_webpage(entries[0]['url'], mobj.group('id') + mobj.group('player')) - else: + def _media_extract(self, page_url, page_id, webpage, mobj=None, entrie=None): + if entrie is not None: + mobj = re.search(self._VALID_URL, entrie['url']) + playerpage = self._download_webpage(entrie['url'], mobj.group('id') + mobj.group('player')) + elif mobj is not None: playerpage = webpage formats = [] flashvars = compat_parse_qs( @@ -198,16 +225,17 @@ class WDRIE(InfoExtractor): ] # The url doesn't seem to contain any information if the current page is a playlist or page with a single media item - if not entries and mobj.group('player') is None: # Playlist page + if not entries and mobj.group('player') is None: # Playlist containing links to webpages return self._playlist_extract(page_url, page_id, webpage) - elif entries and len(entries) > 1: # Different playlist page + elif entries and len(entries) > 1: # Playlist containing multiple playerpages return self.playlist_result(entries, page_id) - elif mobj.group('player') is not None or (entries and len(entries) == 1): # Media page (either just a single player link on the webpage or the webpage is the player) - if not entries: - entries = None - return self._media_extract(page_url, page_id, mobj, webpage, entries) + elif mobj.group('player') is not None: # Mediaextractor (used if a playlist containes multiple playerpages) + return self._media_extract(page_url, page_id, webpage, mobj=mobj) + + elif entries and len(entries) == 1: # Mediaextractor (a page with a single video is usally not a playlist) + return self._media_extract(page_url, page_id, webpage, entrie=entries[0]) class WDRMobileIE(InfoExtractor): From 63f14d8b73116836c4cbe2349eca3111a3f5eab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BCrn=20Brodersen?= Date: Sat, 8 Aug 2015 18:18:55 +0200 Subject: [PATCH 7/7] Fix for broken thumbnail url --- youtube_dl/extractor/wdr.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 6248c8b00..5af617d17 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -151,6 +151,11 @@ class WDRIE(InfoExtractor): video_url = flashvars['dslSrc'][0] title = flashvars['trackerClipTitle'][0] thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None + + if thumbnail is not None: + double_url_regex = r'(' + re.escape(page_url) + r'*){2,}' + thumbnail = re.sub(double_url_regex, page_url, thumbnail) + is_live = flashvars.get('isLive', ['0'])[0] == '1' if is_live: