[wdr] Update for the wdr extractor

2015-07-04 02:03:13 +02:00 · 2015-07-04 02:03:13 +02:00 · 657be7fa62
commit 657be7fa62
parent e56a4c9e9b
1 changed file with 60 additions and 54 deletions
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import itertools
 import re
 import json
 from .common import InfoExtractor
 from ..compat import (
@@ -16,9 +17,8 @@ from ..utils import (
 class WDRIE(InfoExtractor):
-    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
+    _PLAYER_REGEX = 'https?://deviceids-medstdp.wdr.de/ondemand/.+?/.+?\.js'
-    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)\.html'
    _TESTS = [
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
@@ -95,23 +95,7 @@ class WDRIE(InfoExtractor):
        }
    ]
-    def _real_extract(self, url):
+    def _overiew_page_extractor(self, page_url, page_id, webpage):
        mobj = re.match(self._VALID_URL, url)
        page_url = mobj.group('url')
        page_id = mobj.group('id')
        webpage = self._download_webpage(url, page_id)
        if mobj.group('player') is None:
            entries = [
                self.url_result(page_url + href, 'WDR')
                for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
            ]
            if entries:  # Playlist page
                return self.playlist_result(entries, page_id)
            # Overview page
        entries = []
        for page_num in itertools.count(2):
            hrefs = re.findall(
@@ -130,25 +114,31 @@ class WDRIE(InfoExtractor):
                note='Downloading playlist page %d' % page_num)
        return self.playlist_result(entries, page_id)
-        flashvars = compat_parse_qs(
+    def _real_extract(self, url):
-            self._html_search_regex(r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+        mobj = re.match(self._VALID_URL, url)
        page_url = mobj.group('url')
        page_id = mobj.group('id')
-        page_id = flashvars['trackerClipId'][0]
+        webpage = self._download_webpage(url, page_id)
-        video_url = flashvars['dslSrc'][0]
+        entries = re.search(r'%s' % self._PLAYER_REGEX, webpage)
        title = flashvars['trackerClipTitle'][0]
        thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
        is_live = flashvars.get('isLive', ['0'])[0] == '1'
-        if is_live:
+        if entries == None: # Overview page
-            title = self._live_title(title)
+            return self._overiew_page_extractor(page_url, page_id, webpage)
-        if 'trackerClipAirTime' in flashvars:
+        jsonpage = self._download_webpage(entries.group(0), entries.group(0))
-            upload_date = flashvars['trackerClipAirTime'][0]
+        jsonvars = json.loads(jsonpage[38:-2])
        page_id = jsonvars['trackerData']['trackerClipId']
        title = jsonvars['trackerData']['trackerClipTitle']
        formats = []
        for _id, video_field in jsonvars['mediaResource'].items():
            if 'videoURL' in video_field:
                video_url = video_field['videoURL']
            elif 'audioURL' in video_field:
                video_url = video_field['audioURL']
            else:
-            upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
+                break
-
+            is_live = video_field.get('flashvarsExt', {'isLive': '0'}) == {'isLive': '1'}
        if upload_date:
            upload_date = unified_strdate(upload_date)
            if video_url.endswith('.f4m'):
                video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
@@ -163,12 +153,28 @@ class WDRIE(InfoExtractor):
            else:
                ext = determine_ext(video_url)
-        description = self._html_search_meta('Description', webpage, 'description')
+            formats.append({'url': video_url, 'ext': ext, 'format_id': _id})
        thumbnail = re.search('<div class="illustrationCont w960">\n<div class="linkCont">\n<img src="(?P<thumbnail>.+?)"', webpage)
        if thumbnail != None:
            thumbnail = page_url + thumbnail.group('thumbnail')
        if is_live:
            title = self._live_title(title)
        if 'trackerClipAirTime' in jsonvars['trackerData']:
            upload_date = jsonvars['trackerData']['trackerClipAirTime']
        else:
            upload_date = self._html_search_meta('DC.Date', webpage, 'content')
        if upload_date:
            upload_date = unified_strdate(upload_date)
        description = self._html_search_meta('Description', webpage, 'content')
        return {
            'id': page_id,
-            'url': video_url,
+            'formats': formats,
            'ext': ext,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,