2014-01-27 18:40:10 +01:00
# coding: utf-8
from __future__ import unicode_literals
2013-06-23 20:24:07 +02:00
import re
from . common import InfoExtractor
2014-10-10 20:35:34 +02:00
from . generic import GenericIE
2013-06-23 20:24:07 +02:00
from . . utils import (
2014-01-27 18:40:10 +01:00
determine_ext ,
2019-08-05 12:48:39 +02:00
dict_get ,
2013-06-23 20:24:07 +02:00
ExtractorError ,
2014-08-27 02:36:57 +02:00
int_or_none ,
parse_duration ,
2018-12-17 05:29:59 +07:00
qualities ,
str_or_none ,
try_get ,
2014-08-27 02:36:57 +02:00
unified_strdate ,
2018-12-17 05:29:59 +07:00
unified_timestamp ,
2016-07-09 03:18:45 +01:00
update_url_query ,
2018-07-21 19:08:28 +07:00
url_or_none ,
2019-08-05 12:48:39 +02:00
url_basename ,
2018-12-17 05:29:59 +07:00
xpath_text ,
2013-06-23 20:24:07 +02:00
)
2015-10-26 16:41:24 +01:00
from . . compat import compat_etree_fromstring
2013-06-23 20:24:07 +02:00
2014-01-27 18:40:10 +01:00
2014-08-27 02:36:57 +02:00
class ARDMediathekIE(InfoExtractor):
    """Extractor for the URLs matched by ``_VALID_URL``.

    That covers ardmediathek.de (optionally with a ``www`` or ``classic``
    prefix), mediathek.daserste.de, mediathek.rbb-online.de and one.ard.de.
    URLs that ARDBetaMediathekIE accepts are explicitly rejected, see
    ``suitable``.
    """

    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'

    _TESTS = [{
        # available till 26.07.2022
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'only_matching': True,
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'only_matching': True,
    }, {
        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False for URLs ARDBetaMediathekIE claims, else defer to the base class."""
        if ARDBetaMediathekIE.suitable(url):
            return False
        return super(ARDMediathekIE, cls).suitable(url)

    def _extract_media_info(self, media_info_url, webpage, video_id):
        """Download the media JSON and build a partial info dict from it.

        The returned dict carries id, duration, thumbnail, is_live, formats
        and subtitles.

        Raises ExtractorError (expected) when no formats were found and
        either the webpage contains an "fsk" marker or the JSON is flagged
        as ``_geoblocked``.
        """
        media_info = self._download_json(
            media_info_url, video_id, 'Downloading media JSON')

        formats = self._extract_formats(media_info, video_id)

        if not formats:
            if '"fsk"' in webpage:
                raise ExtractorError(
                    'This video is only available after 20:00',
                    expected=True)
            elif media_info.get('_geoblocked'):
                raise ExtractorError(
                    'This video is not available due to geo restriction',
                    expected=True)

        self._sort_formats(formats)

        subtitles = {}
        subtitle_url = media_info.get('_subtitleUrl')
        if subtitle_url:
            subtitles['de'] = [{
                'ext': 'ttml',
                'url': subtitle_url,
            }]

        return {
            'id': video_id,
            'duration': int_or_none(media_info.get('_duration')),
            'thumbnail': media_info.get('_previewImage'),
            'is_live': media_info.get('_isLive') is True,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _extract_formats(self, media_info, video_id):
        """Collect format dicts from the ``_mediaArray`` of a media JSON."""
        type_ = media_info.get('_type')
        # ``or []`` rather than a .get() default: the key may be present
        # with a null value, which a default would not catch.
        media_array = media_info.get('_mediaArray') or []
        formats = []
        for num, media in enumerate(media_array):
            for stream in media.get('_mediaStreamArray') or []:
                stream_urls = stream.get('_stream')
                if not stream_urls:
                    continue
                if not isinstance(stream_urls, list):
                    stream_urls = [stream_urls]
                quality = stream.get('_quality')
                server = stream.get('_server')
                for stream_url in stream_urls:
                    if not url_or_none(stream_url):
                        continue
                    ext = determine_ext(stream_url)
                    # Manifests are only followed for the 'auto' quality.
                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
                        continue
                    if ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            update_url_query(stream_url, {
                                'hdcore': '3.1.1',
                                'plugin': 'aasp-3.1.1.69.124'
                            }),
                            video_id, f4m_id='hds', fatal=False))
                    elif ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            stream_url, video_id, 'mp4', m3u8_id='hls',
                            fatal=False))
                    else:
                        if server and server.startswith('rtmp'):
                            f = {
                                'url': server,
                                'play_path': stream_url,
                                'format_id': 'a%s-rtmp-%s' % (num, quality),
                            }
                        else:
                            f = {
                                'url': stream_url,
                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
                            }
                        # Some file names end in _<width>x<height>.mp4
                        m = re.search(
                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
                            stream_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        if type_ == 'audio':
                            f['vcodec'] = 'none'
                        formats.append(f)
        return formats

    def _real_extract(self, url):
        """Extract the info dict for a Mediathek page URL."""
        # determine video id from url
        document_id = None
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        webpage = self._download_webpage(url, video_id)

        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )

        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)

        # URLs carrying an "rss" query parameter are handed to the generic
        # RSS handling if the document root really is <rss>.
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)

        title = self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')
        description = self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description', default=None)
        if description is None:
            description = self._html_search_regex(
                r'<p\s+class="teasertext">(.+?)</p>',
                webpage, 'teaser text', default=None)

        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)

        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # The format id is the second-to-last dot-separated
                    # component of the URL.
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            self._sort_formats(formats)
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                video_id = self._search_regex(
                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)

        info.update({
            'id': video_id,
            'title': self._live_title(title) if info.get('is_live') else title,
            'description': description,
            # Prefer the og:image thumbnail, but keep the one from the media
            # JSON instead of overwriting it with None when the page has no
            # og:image.
            'thumbnail': thumbnail or info.get('thumbnail'),
        })

        return info
2014-08-27 02:36:57 +02:00
class ARDIE(InfoExtractor):
    """Extractor for daserste.de ``.../videos/<display_id>-<id>.html`` pages.

    Metadata and formats are read from the ``~playerXml.xml`` document that
    sits next to the HTML page.
    """

    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
    _TESTS = [{
        # available till 14.02.2019
        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
        'info_dict': {
            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
            'id': '102',
            'ext': 'mp4',
            'duration': 4435.0,
            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
            'upload_date': '20180214',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the info dict from the player XML of a daserste.de page.

        Raises ExtractorError when the XML has no ``<video>`` node or the
        video has no title.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        player_url = mobj.group('mainurl') + '~playerXml.xml'
        doc = self._download_xml(player_url, display_id)
        video_node = doc.find('./video')
        if video_node is None:
            raise ExtractorError('Unable to find video node in player XML')

        upload_date = unified_strdate(xpath_text(
            video_node, './broadcastDate'))
        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')

        formats = []
        for a in video_node.findall('.//asset'):
            file_name = xpath_text(a, './fileName')
            if not file_name:
                # Without a file name there is nothing to download.
                continue
            # xpath_text yields None for absent children instead of raising
            # AttributeError like a chained .find(...).text would.
            f = {
                'format_id': a.attrib.get('type'),
                'width': int_or_none(xpath_text(a, './frameWidth')),
                'height': int_or_none(xpath_text(a, './frameHeight')),
                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
                'vcodec': xpath_text(a, './codecVideo'),
                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
            }
            server_prefix = xpath_text(a, './serverPrefix')
            if server_prefix:
                f['url'] = server_prefix
                # 'play_path' is the key ARDMediathekIE._extract_formats uses
                # for RTMP streams in this file; the former 'playpath' key
                # did not match it.
                f['play_path'] = file_name
            else:
                f['url'] = file_name
            formats.append(f)
        self._sort_formats(formats)

        return {
            'id': mobj.group('id'),
            'formats': formats,
            'display_id': display_id,
            'title': xpath_text(video_node, './title', 'title', fatal=True),
            'duration': parse_duration(xpath_text(video_node, './duration')),
            'upload_date': upload_date,
            'thumbnail': thumbnail,
        }
2018-09-01 00:18:17 +02:00
class ARDBetaMediathekIE(InfoExtractor):
    """Extractor for ``(beta|www).ardmediathek.de/<channel>/(player|live)/<id>`` URLs.

    All metadata is read from the ``window.__APOLLO_STATE__`` JSON object
    embedded in the page.
    """

    _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?'
    _TESTS = [{
        'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita',
        'md5': '2d02d996156ea3c397cfc5036b5d7f8f',
        'info_dict': {
            'display_id': 'die-robuste-roswita',
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
            'title': 'Tatort: Die robuste Roswita',
            'description': r're:^Der Mord.*trüber ist als die Ilm.',
            'duration': 5316,
            'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard',
            'upload_date': '20180826',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }]

    # Per-broadcaster URL patterns. The first pattern that matches a plain
    # HTTP format URL supplies the named groups (width, height, fps, tbr,
    # vbr, width_key) that _get_format_from_url turns into format fields.
    # 'format_id_suffix' names the group appended to the format id, and
    # 'width_dict' maps a 'width_key' group value to a pixel width.
    _format_url_templates = [
        # Das Erste
        {
            'pattern': r'^.+/(?P<width>\d+)-[^/]+_[^/]+\..{3,4}$',
            'format_id_suffix': 'width',
        },

        # SWR / SR / NDR
        {
            'pattern': r'^.+/[^/]+\.(?P<width_key>[a-z]+)\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                # SWR / SR
                'xxl': 1920,
                'xl': 1280,
                'l': 960,
                'ml': 640,
                'm': 512,
                'sm': 480,
                's': 320,

                # NDR
                'hd': 1280,
                'hq': 960,
                'ln': 640,
                'hi': 512,
                'mn': 480,
                'lo': 320,
            },
        },

        # BR / ARD-alpha / SR
        {
            'pattern': r'^.+/[^/]+_(?P<width_key>[A-Z0-9])\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                # BR, ARD-alpha
                'X': 1280,
                'C': 960,
                'E': 640,
                'B': 512,
                '2': 480,
                'A': 480,
                '0': 320,

                # SR
                'P': 1280,
                'L': 960,
                'N': 640,
                'M': 512,
                'K': 480,
                'S': 320,
            },
        },

        # HR
        {
            'pattern': r'^.+/[^/]+?(?P<width>[0-9]+)x(?P<height>[0-9]+)-(?P<fps>[0-9]+)[pi]-(?P<tbr>[0-9]+)kbit\..{3,4}$',
            'format_id_suffix': 'tbr',
        },

        # Radio Bremen
        {
            'pattern': r'^.+/[^/]+_(?P<height>\d+)p\..{3,4}$',
            'format_id_suffix': 'height',
        },

        # RBB
        {
            'pattern': r'^.+/[^/]+_(?P<vbr>\d+)k\..{3,4}$',
            'format_id_suffix': 'vbr',
        },

        # tagesschau24
        {
            'pattern': r'^.+/[^/]+\.(?P<width_key>[a-z]+)\.[^/]+\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                'webxl': 1280,
                'webl': 960,
                'webml': 640,
                'webm': 512,
                'websm': 480,
                'webs': 256,
            },
        },

        # MDR
        {
            'pattern': r'^.+/[^/]+-(?P<width_key>[a-z0-9]+)_[^/]+\..{3,4}$',
            'format_id_suffix': 'width_key',
            'width_dict': {
                'be7c2950aac6': 1280,
                '730aae549c28': 960,
                '41dd60577440': 640,
                '9a4bb04739be': 512,
                '39c393010ca9': 480,
                'd1ceaa57a495': 320,
            },
        },

        # TODO Find out format data for videos from WDR and ONE.
    ]

    def _get_format_from_url(self, format_url, quality):
        """Extract as much format data from the format_url as possible.

        Use the templates listed in _format_url_templates to do so. Only the
        first matching template is applied. Returns a format dict with url,
        preference and format_id, plus width/height/fps/tbr/vbr entries
        (possibly None) when a template matched.
        """
        result = {
            'url': format_url,
            'preference': 10,  # Plain HTTP, that's nice
        }

        format_id_suffix = None

        for template in self._format_url_templates:
            m = re.match(template['pattern'], format_url)
            if not m:
                continue
            groupdict = m.groupdict()
            for key in ('width', 'height', 'fps', 'tbr', 'vbr'):
                result[key] = int_or_none(groupdict.get(key))

            width_dict = template.get('width_dict')
            if width_dict:
                result['width'] = width_dict.get(groupdict.get('width_key'))

            format_id_suffix = groupdict.get(template.get('format_id_suffix'))
            break

        # Fill in the missing dimension assuming a 16:9 picture. Multiply
        # before the floor division so Python 2 and Python 3 agree (the old
        # `int((h / 9) * 16)` truncated early under Python 2).
        if result.get('width') and not result.get('height'):
            result['height'] = result['width'] * 9 // 16

        if result.get('height') and not result.get('width'):
            result['width'] = result['height'] * 16 // 9

        format_id = ('http-' + quality) if quality else 'http'
        if format_id_suffix:
            format_id += '-' + format_id_suffix
        result['format_id'] = format_id

        return result

    def _extract_episode_info(self, title):
        """Try to derive season_number, episode_number and episode from title.

        Returns an empty dict when the title is empty or matches none of the
        patterns; otherwise a dict with all three keys (values may be None).
        """
        res = {}
        # re.match() raises TypeError on None, and the caller passes
        # res.get('title'), which may be missing.
        if not title:
            return res

        patterns = [
            r'.*(?P<ep_info>\(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            r'.*(?P<ep_info>\((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) ).*',
        ]

        for pattern in patterns:
            m = re.match(pattern, title)
            if not m:
                continue
            groupdict = m.groupdict()
            for int_entry in ('season_number', 'episode_number'):
                res[int_entry] = int_or_none(groupdict.get(int_entry))
            res['episode'] = str_or_none(groupdict.get('episode'))

            # No explicit episode name captured: use the title with the
            # matched episode marker removed.
            ep_info = groupdict.get('ep_info')
            if ep_info and not res['episode']:
                res['episode'] = str_or_none(title.replace(ep_info, ''))
            if res['episode']:
                res['episode'] = res['episode'].strip()
            break

        return res

    def _real_extract(self, url):
        """Extract the info dict from the page's ``__APOLLO_STATE__`` JSON.

        Raises a geo-restriction error or an (expected) ExtractorError when
        no formats were found and a widget flagged the video as geoblocked
        or blocked by FSK respectively.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')
        display_id = mobj.group('display_id') or video_id

        webpage = self._download_webpage(url, display_id)
        data_json = self._search_regex(
            r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json')
        data = self._parse_json(data_json, display_id)

        res = {
            'id': video_id,
            'display_id': display_id,
        }
        formats = []
        subtitles = {}
        geoblocked = False
        blocked_by_fsk = False

        for widget in data.values():
            # Everything below treats the widget as a mapping.
            if not isinstance(widget, dict):
                continue

            if widget.get('_geoblocked') is True:
                geoblocked = True
            if widget.get('blockedByFsk') is True:
                blocked_by_fsk = True

            if '_duration' in widget:
                res['duration'] = int_or_none(widget['_duration'])
            if 'clipTitle' in widget:
                res['title'] = widget['clipTitle']
            if '_previewImage' in widget:
                res['thumbnail'] = widget['_previewImage']
            if 'broadcastedOn' in widget:
                res['timestamp'] = unified_timestamp(widget['broadcastedOn'])
            if 'synopsis' in widget:
                res['description'] = widget['synopsis']
            if 'maturityContentRating' in widget:
                fsk_str = str_or_none(widget['maturityContentRating'])
                if fsk_str:
                    m = re.match(r'(?:FSK|fsk|Fsk)(\d+)', fsk_str)
                    if m and m.group(1):
                        res['age_limit'] = int_or_none(m.group(1))
                    else:
                        res['age_limit'] = 0

            subtitle_url = url_or_none(widget.get('_subtitleUrl'))
            if subtitle_url:
                subtitles.setdefault('de', []).append({
                    'ext': 'ttml',
                    'url': subtitle_url,
                })

            if '_quality' not in widget:
                continue

            # Read format URLs from a MediaStreamArray
            stream_array = try_get(widget, lambda x: x['_stream']['json'])
            if not stream_array:
                continue
            # Same normalisation as ARDMediathekIE._extract_formats: a single
            # value is treated as a one-element list.
            if not isinstance(stream_array, list):
                stream_array = [stream_array]

            for format_url in stream_array:
                format_url = url_or_none(format_url)
                if not format_url:
                    continue

                # Make sure this format isn't already in our list.
                # Occasionally, there are duplicate files from
                # different servers.
                basename = url_basename(format_url)
                if any(url_basename(x['url']) == basename for x in formats):
                    continue

                ext = determine_ext(format_url)
                if ext == 'f4m':
                    # update_url_query keeps an existing query string intact,
                    # unlike appending '?hdcore=...' by hand.
                    formats.extend(self._extract_f4m_formats(
                        update_url_query(format_url, {'hdcore': '3.11.0'}),
                        video_id, f4m_id='hds', fatal=False))
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        format_url, video_id, 'mp4', m3u8_id='hls',
                        fatal=False))
                else:
                    quality = str_or_none(widget.get('_quality'))
                    formats.append(
                        self._get_format_from_url(format_url, quality))

        if not formats and geoblocked:
            self.raise_geo_restricted(
                msg='This video is not available due to geoblocking',
                countries=['DE'])

        if not formats and blocked_by_fsk:
            # age_limit is only set when a maturityContentRating widget was
            # seen; do not let a KeyError mask the real error.
            age_limit = res.get('age_limit')
            if age_limit is None:
                message = 'This video is currently not available due to age restrictions.'
            else:
                message = 'This video is currently not available due to age restrictions (FSK %d). Try again from %02d:00 to 06:00.' % (
                    age_limit, 22 if age_limit < 18 else 23)
            raise ExtractorError(message, expected=True)

        self._sort_formats(formats)
        res.update({
            'subtitles': subtitles,
            'formats': formats,
        })

        res.update(self._extract_episode_info(res.get('title')))

        return res