[wdr] Use old extractor method and add more formats

2015-07-04 13:14:14 +02:00 · 2015-07-04 13:14:14 +02:00 · 15db1db897
commit 15db1db897
parent b4fd2653ea
1 changed files with 79 additions and 62 deletions
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals

 import itertools
 import re
-import json

 from .common import InfoExtractor
 from ..compat import (
@@ -17,15 +16,16 @@ from ..utils import (


 class WDRIE(InfoExtractor):
-    _PLAYER_REGEX = 'https?://deviceids-medstdp.wdr.de/ondemand/.+?/.+?\.js'
-    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)\.html'
+    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
+    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+
    _TESTS = [
        {
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html',
            'info_dict': {
                'id': 'mdb-750693',
                'ext': 'mp4',
-                'title': 'Streetfood-Pioniere',
+                'title': 'HIER UND HEUTE: Streetfood-Pioniere',
                'description': 'md5:bff1fdc6de7df044ac2bec13ab46e6a9',
                'upload_date': '20150703',
                'is_live': False
@@ -41,8 +41,8 @@ class WDRIE(InfoExtractor):
            'info_dict': {
                'id': 'mdb-726385',
                'ext': 'mp3',
-                'title': 'Weselsky | 1LIVE Bahnansage (04.06.2015)',
-                'description': 'md5:8b9ef2af8c1bb01394ab98f3450ff04d',
+                'title': '1LIVE Bahnansage',
+                'description': 'md5:36016b06288e1f1a5b2602c8fe947b8d',
                'upload_date': '20150604',
                'is_live': False
            },
@@ -54,7 +54,7 @@ class WDRIE(InfoExtractor):
                'id': 'mdb-752045',
                'ext': 'mp3',
                'title': 'Roskilde Festival 2015',
-                'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3',
+                'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
                'upload_date': '20150702',
                'is_live': False
            },
@@ -82,82 +82,99 @@ class WDRIE(InfoExtractor):
        }
    ]

-    def _overiew_page_extractor(self, page_url, page_id, webpage):
-        entries = []
-        for page_num in itertools.count(2):
-            hrefs = re.findall(
-                r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
-                webpage)
-            entries.extend(
-                self.url_result(page_url + href, 'WDR')
-                for href in hrefs)
-            next_url_m = re.search(
-                r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
-            if not next_url_m:
-                break
-            next_url = page_url + next_url_m.group(1)
-            webpage = self._download_webpage(
-                next_url, page_id,
-                note='Downloading playlist page %d' % page_num)
-        return self.playlist_result(entries, page_id)
-
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        page_url = mobj.group('url')
        page_id = mobj.group('id')

        webpage = self._download_webpage(url, page_id)
-        entries = re.search(r'%s' % self._PLAYER_REGEX, webpage)

-        if entries is None:  # Overview page
-            return self._overiew_page_extractor(page_url, page_id, webpage)
+        if mobj.group('player') is None:
+            entries = [
+                self.url_result(page_url + href, 'WDR')
+                for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
+            ]

-        jsonpage = self._download_webpage(entries.group(0), entries.group(0))
-        jsonvars = json.loads(jsonpage[38:-2])
+            if entries:  # Playlist page
+                return self.playlist_result(entries, page_id)
+
+            # Overview page
+            entries = []
+            for page_num in itertools.count(2):
+                hrefs = re.findall(
+                    r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
+                    webpage)
+                entries.extend(
+                    self.url_result(page_url + href, 'WDR')
+                    for href in hrefs)
+                next_url_m = re.search(
+                    r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
+                if not next_url_m:
+                    break
+                next_url = page_url + next_url_m.group(1)
+                webpage = self._download_webpage(
+                    next_url, page_id,
+                    note='Downloading playlist page %d' % page_num)
+            return self.playlist_result(entries, page_id)

-        page_id = jsonvars['trackerData']['trackerClipId']
-        title = jsonvars['trackerData']['trackerClipTitle']
        formats = []
-        for _id, video_field in jsonvars['mediaResource'].items():
-            if 'videoURL' in video_field:
-                video_url = video_field['videoURL']
-            elif 'audioURL' in video_field:
-                video_url = video_field['audioURL']
-            else:
-                break
-            is_live = video_field.get('flashvarsExt', {'isLive': '0'})
-            is_live = is_live.get('isLive', '0') == '1'
+        flashvars = compat_parse_qs(
+            self._html_search_regex(r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))

-            if video_url.endswith('.f4m'):
-                video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
-                ext = 'flv'
-            elif video_url.endswith('.smil'):
-                fmt = self._extract_smil_formats(video_url, page_id)[0]
-                video_url = fmt['url']
-                sep = '&' if '?' in video_url else '?'
-                video_url += sep
-                video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43'
-                ext = fmt['ext']
-            else:
-                ext = determine_ext(video_url)
-
-            formats.append({'url': video_url, 'ext': ext, 'format_id': _id})
-
-        thumbnail = re.search('<div class="illustrationCont w960">\n<div class="linkCont">\n<img src="(?P<thumbnail>.+?)"', webpage)
-        if thumbnail is not None:
-            thumbnail = page_url + thumbnail.group('thumbnail')
+        page_id = flashvars['trackerClipId'][0]
+        video_url = flashvars['dslSrc'][0]
+        title = flashvars['trackerClipTitle'][0]
+        thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
+        is_live = flashvars.get('isLive', ['0'])[0] == '1'

        if is_live:
            title = self._live_title(title)

-        if 'trackerClipAirTime' in jsonvars['trackerData']:
-            upload_date = jsonvars['trackerData']['trackerClipAirTime']
+        if 'trackerClipAirTime' in flashvars:
+            upload_date = flashvars['trackerClipAirTime'][0]
        else:
            upload_date = self._html_search_meta('DC.Date', webpage, 'content')

        if upload_date:
            upload_date = unified_strdate(upload_date)

+        if video_url.endswith('.f4m'):
+            video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
+            ext = 'flv'
+        elif video_url.endswith('.smil'):
+            fmt = self._extract_smil_formats(video_url, page_id)[0]
+            video_url = fmt['url']
+            sep = '&' if '?' in video_url else '?'
+            video_url += sep
+            video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43'
+            ext = fmt['ext']
+        else:
+            ext = determine_ext(video_url)
+
+        formats.append({'ext': ext, 'url': video_url})
+
+        m3u8_url = re.search(r'<li>\n<a rel="adaptiv" type="application/vnd\.apple\.mpegURL" href="(?P<link>.+?)"', webpage)
+
+        if m3u8_url is not None:
+            m3u8_url = m3u8_url.group('link')
+            formats.append({'ext': 'm3u8', 'url': m3u8_url})
+
+        webL_quality = -1
+        for video_vars in re.findall(r'<li>\n<a rel="(?P<format_id>web.?)"  href=".+?/(?P<link>fsk.+?)"', webpage):
+            format_id = video_vars[0]
+            video_url = 'http://ondemand-ww.wdr.de/medstdp/' + video_vars[1]
+            ext = determine_ext(video_url)
+            if format_id == 'webL':
+                quality = webL_quality
+                webL_quality -= 1
+            if format_id == 'webM':
+                quality = -3
+            if format_id == 'webS':
+                quality = -4
+            formats.append({'format_id': format_id, 'ext': ext, 'url': video_url, 'source_preference': quality})
+
+        self._sort_formats(formats)
+
        description = self._html_search_meta('Description', webpage, 'content')

        return {