2014-07-01 09:59:57 +02:00
# -*- coding: utf-8 -*-
2014-03-12 04:20:47 +07:00
from __future__ import unicode_literals
2015-01-09 21:33:07 +01:00
import itertools
2014-03-12 04:20:47 +07:00
import re
from . common import InfoExtractor
2014-12-13 12:24:42 +01:00
from . . compat import (
2014-03-30 07:25:42 +02:00
compat_parse_qs ,
2014-07-23 02:44:30 +02:00
compat_urlparse ,
2014-12-13 12:24:42 +01:00
)
from . . utils import (
2014-03-12 04:20:47 +07:00
determine_ext ,
2014-03-30 07:25:42 +02:00
unified_strdate ,
2015-07-05 00:41:23 +02:00
qualities
2014-03-12 04:20:47 +07:00
)
class WDRIE(InfoExtractor):
    """Extractor for wdr.de / funkhauseuropa.de pages.

    A matching URL can turn out to be one of three things, which can only be
    told apart after the page has been downloaded (see ``_real_extract``):
    a paginated listing of videos, a page linking to several player pages,
    or a single media item (either the player page itself or a page that
    links to exactly one player page).
    """

    # Suffix that marks a URL as a player page.
    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX

    _TESTS = [
        {
            # Test single media extraction (video)
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/hier_und_heute/videostreetfoodpioniere100.html',
            'info_dict': {
                'id': 'mdb-750693',
                'ext': 'mp4',
                'title': 'HIER UND HEUTE: Streetfood-Pioniere',
                'description': 'md5:bff1fdc6de7df044ac2bec13ab46e6a9',
                'upload_date': '20150703',
                'is_live': False
            },
            'params': {
                'skip_download': True,
                'format': 'best'
            },
        },
        {
            # Test single media extraction (audio)
            'url': 'http://www1.wdr.de/mediathek/audio/1live/einslive-bahnansage-100.html',
            'md5': '87c389aac18ee6fc041aa1ced52aac76',
            'info_dict': {
                'id': 'mdb-726385',
                'ext': 'mp3',
                'title': '1LIVE Bahnansage',
                'description': 'md5:8b9ef2af8c1bb01394ab98f3450ff04d',
                'upload_date': '20150604',
                'is_live': False
            },
        },
        {
            # Test single media extraction (audio)
            'url': 'http://www.funkhauseuropa.de/musik/musikspecials/roskilde-zweitausendfuenfzehn-100.html',
            'md5': 'e50e0c8900f6558ae12cd9953aca5a20',
            'info_dict': {
                'id': 'mdb-752045',
                'ext': 'mp3',
                'title': 'Roskilde Festival 2015',
                'description': 'md5:48e7a0a884c0e841a9d9174e27c67df3',
                'upload_date': '20150702',
                'is_live': False
            },
        },
        {
            # Test playlist extraction (containing links to webpages)
            'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
            'playlist_mincount': 146,
            'info_dict': {
                'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
                'title': 'md5:31d3634678b18f90a9fc4e7cd34ba3b2'
            }
        },
        {
            # Test playlist extraction (containing links to playerpages)
            'url': 'http://www.funkhauseuropa.de/index.html',
            'playlist_mincount': 3,
            'info_dict': {
                'id': 'index',
            }
        },
        {
            # Test live tv
            'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',
            'info_dict': {
                'id': 'mdb-103364',
                'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
                'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
                'ext': 'flv',
                'upload_date': '20150101',
                'is_live': True
            },
            'params': {
                'skip_download': True,
            },
        }
    ]

    def _playlist_extract(self, page_url, page_id, webpage):
        """Build a playlist from a paginated listing page.

        page_url -- site root captured by the ``url`` group of _VALID_URL
        page_id  -- id used for the playlist and for download notes
        webpage  -- HTML of the first listing page (already downloaded)

        Follows the "nextToLast" link until a page has none, collecting one
        url_result per "mediathekvideo" list item on every page visited.
        """
        entries = []
        # The first page is already in hand, so numbering of the pages that
        # still have to be fetched starts at 2.
        for page_num in itertools.count(2):
            hrefs = re.findall(
                r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
                webpage)
            # urljoin instead of plain concatenation: page_url ends with '/'
            # and these hrefs start with '/', which used to yield '//' URLs.
            entries.extend(
                self.url_result(compat_urlparse.urljoin(page_url, href), 'WDR')
                for href in hrefs)
            next_url_m = re.search(
                r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
            if not next_url_m:
                break
            next_url = compat_urlparse.urljoin(page_url, next_url_m.group(1))
            webpage = self._download_webpage(
                next_url, page_id,
                note='Downloading playlist page %d' % page_num)

        # NOTE(review): the third positional argument looks like the playlist
        # title, so this passes the raw HTML of the last downloaded page as
        # the title (the playlist test pins the title only as an md5).
        # Kept as-is to preserve behaviour; confirm whether a real title
        # should be extracted here instead.
        return self.playlist_result(entries, page_id, webpage)

    def _media_extract(self, page_url, page_id, mobj, webpage, entries):
        """Extract a single media item and return its info dict.

        page_url -- site root (not used here; kept for a uniform signature)
        page_id  -- id from the URL; replaced by the tracker clip id below
        mobj     -- match of _VALID_URL against the requested URL
        webpage  -- HTML of the requested page
        entries  -- url_results for player links found on ``webpage``; only
                    read when the requested URL is not itself a player page,
                    in which case it must hold at least one entry
        """
        if mobj.group('player') is None:
            # The requested page merely links to the player page: fetch it.
            mobj = re.search(self._VALID_URL, entries[0]['url'])
            playerpage = self._download_webpage(
                entries[0]['url'], mobj.group('id') + mobj.group('player'))
        else:
            playerpage = webpage

        formats = []

        flashvars = compat_parse_qs(
            self._html_search_regex(
                r'<param name="flashvars" value="([^"]+)"', playerpage, 'flashvars'))

        page_id = flashvars['trackerClipId'][0]
        video_url = flashvars['dslSrc'][0]
        title = flashvars['trackerClipTitle'][0]
        thumbnail = flashvars.get('startPicture', [None])[0]
        is_live = flashvars.get('isLive', ['0'])[0] == '1'

        if is_live:
            title = self._live_title(title)

        if 'trackerClipAirTime' in flashvars:
            upload_date = flashvars['trackerClipAirTime'][0]
        else:
            upload_date = self._html_search_meta('DC.Date', webpage, 'content')

        if upload_date:
            upload_date = unified_strdate(upload_date)

        if video_url.endswith('.f4m'):
            video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
            ext = 'flv'
        elif video_url.endswith('.smil'):
            # Only the first format listed in the SMIL document is used.
            fmt = self._extract_smil_formats(video_url, page_id)[0]
            video_url = fmt['url']
            sep = '&' if '?' in video_url else '?'
            video_url += sep
            video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43'
            ext = fmt['ext']
        else:
            ext = determine_ext(video_url)

        formats.append({'ext': ext, 'url': video_url})

        m3u8_url = re.search(
            r'<li>\n<a rel="adaptiv" type="application/vnd\.apple\.mpegURL" href="(?P<link>.+?)"',
            playerpage)
        if m3u8_url is not None:
            m3u8_url = m3u8_url.group('link')
            formats.extend(self._extract_m3u8_formats(m3u8_url, page_id))

        quality = qualities(['webS', 'webM', 'webL_Lo', 'webL_Hi'])

        # There are two videos tagged as webL. The first one is usually of
        # better quality.
        webL_first = True
        for video_vars in re.findall(
                r'<li>\n<a rel="(?P<format_id>web.?)" href=".+?/(?P<link>fsk.+?)"',
                playerpage):
            format_id = video_vars[0]
            # Just using the href results in a warning page (that tells you
            # to install flash player) and not the actual media.
            video_url = 'http://ondemand-ww.wdr.de/medstdp/' + video_vars[1]
            ext = determine_ext(video_url)

            if format_id == 'webL':
                if webL_first:
                    format_id = 'webL_Hi'
                    webL_first = False
                else:
                    format_id = 'webL_Lo'

            formats.append({
                'format_id': format_id,
                'ext': ext,
                'url': video_url,
                'source_preference': quality(format_id),
            })

        self._sort_formats(formats)

        # Using the webpage works better with funkhauseuropa
        description = self._html_search_meta('Description', webpage, 'content')

        return {
            'id': page_id,
            'formats': formats,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'is_live': is_live
        }

    def _real_extract(self, url):
        """Download the page and dispatch to playlist or media extraction."""
        mobj = re.match(self._VALID_URL, url)
        page_url = mobj.group('url')
        page_id = mobj.group('id')
        is_player_page = mobj.group('player') is not None

        webpage = self._download_webpage(url, page_id)

        # Links to player pages found on this page.
        entries = [
            self.url_result(page_url + href, 'WDR')
            for href in re.findall(
                r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX,
                webpage)
        ]

        # The url doesn't seem to contain any information if the current page
        # is a playlist or page with a single media item
        if not entries and not is_player_page:
            # Playlist page
            return self._playlist_extract(page_url, page_id, webpage)
        if len(entries) > 1:
            # Different playlist page
            return self.playlist_result(entries, page_id)
        # Media page: either just a single player link on the webpage or the
        # webpage is the player itself (entries is not read in that case).
        return self._media_extract(page_url, page_id, mobj, webpage, entries)
2014-03-30 07:25:42 +02:00
2014-05-12 22:17:19 +02:00
class WDRMobileIE(InfoExtractor):
    """Extractor for direct mobile-ondemand.wdr.de media URLs.

    No page is downloaded: id, title and age limit are all taken from the
    URL itself, and the URL is returned unchanged as the media URL.
    """

    _VALID_URL = r'''(?x)
        https?://mobile-ondemand\.wdr\.de/
        .*?/fsk(?P<age_limit>[0-9]+)
        /[0-9]+/[0-9]+/
        (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
    IE_NAME = 'wdr:mobile'
    _TEST = {
        'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
        'info_dict': {
            'title': '4283021',
            'id': '421735',
            'ext': 'mp4',
            'age_limit': 0,
        },
        'skip': 'Problems with loading data.'
    }

    def _real_extract(self, url):
        """Return the info dict built from the named groups of _VALID_URL."""
        mobj = re.match(self._VALID_URL, url)
        return {
            'id': mobj.group('id'),
            'title': mobj.group('title'),
            'age_limit': int(mobj.group('age_limit')),
            'url': url,
            # The media is requested with a fixed 'mobile' User-Agent.
            'http_headers': {
                'User-Agent': 'mobile',
            },
        }
2014-03-30 07:25:42 +02:00
class WDRMausIE(InfoExtractor):
    """Extractor for wdrmaus.de ("Sendung mit der Maus") video pages."""

    _VALID_URL = r'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
    IE_DESC = 'Sendung mit der Maus'
    _TESTS = [{
        'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
        'info_dict': {
            'id': 'aktuelle-sendung',
            'ext': 'mp4',
            'thumbnail': r're:^http://.+\.jpg',
            'upload_date': 're:^[0-9]{8}$',
            'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
        }
    }, {
        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
        'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
        'info_dict': {
            'id': '40_jahre_maus',
            'ext': 'mp4',
            'thumbnail': r're:^http://.+\.jpg',
            'upload_date': '20131007',
            'title': '12.03.2011 - 40 Jahre Maus',
        }
    }]

    def _real_extract(self, url):
        """Extract the video described by the page's ``startVideo`` link.

        The link's query string carries the stream URL (``firstVideo``) and
        the thumbnail (``startPicture``). If the site's translation table can
        be downloaded, a second, HTTP format is derived from the stream URL.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        # Accept the ampersand both literally and as the '&amp;' entity.
        param_code = self._html_search_regex(
            r'<a href="\?startVideo=1&(?:amp;)?([^"]+)"', webpage, 'parameters')

        title_date = self._search_regex(
            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
            webpage, 'air date')
        title_str = self._html_search_regex(
            r'<h1>(.*?)</h1>', webpage, 'title')
        title = '%s - %s' % (title_date, title_str)
        upload_date = unified_strdate(
            self._html_search_meta('dc.date', webpage))

        fields = compat_parse_qs(param_code)
        video_url = fields['firstVideo'][0]
        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])

        formats = [{
            'format_id': 'rtmp',
            'url': video_url,
        }]

        # Best effort: without the table only the 'rtmp' format is returned.
        jscode = self._download_webpage(
            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
            video_id, fatal=False,
            note='Downloading URL translation table',
            errnote='Could not download URL translation table')
        if jscode:
            for m in re.finditer(
                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
                    jscode):
                stream_prefix = m.group('stream')
                if video_url.startswith(stream_prefix):
                    # Swap only the matched prefix; a plain str.replace would
                    # also rewrite any later occurrence inside the URL.
                    http_url = m.group('dl') + video_url[len(stream_prefix):]
                    formats.append({
                        'format_id': 'http',
                        'url': http_url,
                    })
                    break

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }