2015-07-13 07:39:55 -05:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import (
2015-07-17 03:27:47 -05:00
ExtractorError ,
js_to_json ,
float_or_none ,
int_or_none ,
2015-07-13 07:39:55 -05:00
)
2015-07-17 03:27:47 -05:00
2015-07-13 07:39:55 -05:00
class YospaceIE ( InfoExtractor ) :
_VALID_URL = r ' http://(?:csm-[a-z]|mas-[a-z]).cds \ d+.yospace.com/(?P<type>csm|mas)/(?P<id> \ d+/ \ d+) '
_TESTS = [
{
' url ' : ' http://csm-e.cds1.yospace.com/csm/108986746/100312513735 ' ,
' info_dict ' : {
' id ' : ' 108986746_100312513735 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' 108986746_100312513735 ' ,
' description ' : None ,
} ,
' params ' : {
# m3u8 download
' skip_download ' : True ,
} ,
} ,
]
def _extract_m3u8 ( self , hls_url ) :
formats = self . _extract_m3u8_formats ( hls_url , ' hls ' , ' mp4 ' , m3u8_id = ' hls ' )
return formats
def _extract_formats ( self , mas_url , video_id ) :
formats = [ ]
hls_url = None
2015-07-21 03:33:33 -05:00
jfpage = self . _download_webpage ( mas_url , ' json ' )
2015-07-13 07:39:55 -05:00
jf = self . _parse_json ( jfpage , video_id , transform_source = js_to_json )
for ent in jf :
2015-07-17 03:27:47 -05:00
if ent . get ( ' type ' , ' ' ) == ' application/x-mpeg-url ' :
hls_url = ent . get ( ' url ' )
formats . extend ( self . _extract_m3u8 ( hls_url ) )
else :
tbr = float_or_none ( ent . get ( ' size ' , 0 ) , 1000 )
if tbr == 0 :
r = re . search ( r ' [ \ ? \ &]q=( \ d+) ' , ent . get ( ' url ' ) )
if r :
tbr = float_or_none ( r . group ( 1 ) , 1 )
formats . append ( {
' url ' : ent . get ( ' url ' ) ,
' format_id ' : ent . get ( ' method ' , ' unknown ' ) + ' - ' + ent . get ( ' container ' , ' unknown ' ) ,
' protocol ' : ent . get ( ' url ' ) . split ( ' : ' ) [ 0 ] ,
' tbr ' : tbr ,
' ext ' : ent . get ( ' container ' , ' unknown ' )
} )
2015-07-13 07:39:55 -05:00
return formats
def _real_extract ( self , url ) :
mobj = re . match ( self . _VALID_URL , url )
video_id = mobj . group ( ' id ' )
url_type = mobj . group ( ' type ' )
display_id = url_type
formats = [ ]
hls_url = None
if url_type == ' mas ' :
2015-07-17 03:27:47 -05:00
mas_url = url . split ( ' ? ' ) [ 0 ] + ' ?trans=json '
2015-07-13 07:39:55 -05:00
formats = self . _extract_formats ( mas_url , video_id )
else :
hls_url = url
if hls_url is not None :
formats . extend ( self . _extract_m3u8 ( hls_url ) )
self . _sort_formats ( formats )
return {
2015-07-17 03:27:47 -05:00
' id ' : video_id . replace ( ' / ' , ' _ ' ) ,
2015-07-13 07:39:55 -05:00
' display_id ' : display_id ,
2015-07-17 03:27:47 -05:00
' title ' : video_id . replace ( ' / ' , ' _ ' ) ,
2015-07-13 07:39:55 -05:00
' formats ' : formats ,
}
2015-07-17 03:27:47 -05:00
2015-07-13 07:39:55 -05:00
class ReutersIE ( YospaceIE ) :
_VALID_URL = r ' http://(?:www \ .)?reuters.com/.*?(?P<id>[^/]+)$ '
_TESTS = [
{
' url ' : ' http://www.reuters.com/article/2015/06/22/mideast-crisis-turkey-idINKBN0P21VN20150622 ' ,
' info_dict ' : {
' id ' : ' 364679847 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Refugees flood back into Syria from Turkey ' ,
' description ' : ' Thousands of Syrians have streamed back across the border from Turkey to their hometown, now liberated from Islamic State. Sean Carberry reports. ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
{
' url ' : ' http://www.reuters.com/video/2015/07/10/mexican-volcano-spews-fire-and-ash?videoId=364911782 ' ,
' info_dict ' : {
' id ' : ' 364911782 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Mexican volcano spews fire and ash ' ,
' description ' : ' Mexico \' s Fire Volcano spews fire and ash as streams of lava run down its side. Rough Cut (no reporter narration). ' ,
} ,
' params ' : {
' skip_download ' : True ,
} ,
} ,
]
2015-07-17 03:27:47 -05:00
def _scrape_javascript ( self , webpage ) :
2015-07-13 07:39:55 -05:00
ret = [ ]
rdata = { }
2015-07-17 03:27:47 -05:00
2015-07-21 03:33:33 -05:00
javascript_chunks = re . findall ( r ' <script[^>]*>(.*?)</script> ' , webpage , re . DOTALL )
2015-07-13 07:39:55 -05:00
if not javascript_chunks :
return
2015-07-17 03:27:47 -05:00
2015-07-13 07:39:55 -05:00
def msub ( m ) :
s = m . group ( 1 )
if rdata . get ( s ) :
s = rdata . get ( s )
2015-07-21 03:33:33 -05:00
return ' : ' + s + m . group ( 2 ) + ' '
return ' : False ' + m . group ( 2 ) + ' '
def cleanjsonvars ( str ) : # just str/int variables that won't break js_to_json
# restr=r'[\'"]([^\'"]+)[\'"]\s*:\s*(([\'"])(|.*?[^\\])\3|\d+|[a-zA-Z0-9\._]+)\s*[,\}\]]'
restr = r """ (?x)
[ \' " ]([^ \' " ]+)[ \' " ] # quoted key
\s * : \s * # key -> var
( # var: str/int/bareword
( [ \' " ]) # str: startquote -> \3
( #
| # str: blank
. * ? [ ^ \\] # str: accounting for \'s
) #
\3 | # str: endquote
\d + | # int
[ a - zA - Z0 - 9 \. _ ] + # bareword
) #
\s * [ , \} \] ] # end with , or } ] if nested
"""
m = re . findall ( restr , str )
return ' { ' + ' \n ' . join ( [ " ' " + f [ 0 ] + " ' : " + f [ 1 ] + ' , '
for f in m ] ) + ' } '
2015-07-17 03:27:47 -05:00
2015-07-13 07:39:55 -05:00
vidnum = 0
for innerhtml in javascript_chunks :
2015-07-21 03:33:33 -05:00
js_vars = re . findall ( r ' ^ \ s*(Reuters \ .[a-zA-Z0-9 \ ._]+) \ s*= \ s*([ \' " ](?:|.*?[^ \\ ][ \' " ])| \ d+); ' , innerhtml , re . M )
if js_vars :
for ent in js_vars :
if not ent [ 1 ] :
continue
rdata [ ent [ 0 ] ] = ent [ 1 ]
drawplayer_js = re . search ( r ' Reuters.yovideo.drawPlayer \ (( \ { .+? \ }) \ ); ' , innerhtml , re . DOTALL )
2015-07-13 07:39:55 -05:00
if drawplayer_js :
vidnum + = 1
2015-07-21 03:33:33 -05:00
js = cleanjsonvars ( drawplayer_js . group ( 1 ) )
js = re . sub ( r ' : \ s*(Reuters \ .[a-zA-Z_]+ \ .[a-zA-Z_]+) \ s*([, \ }]) ' , msub , js )
vdata = self . _parse_json ( js , ' javascript chunk ' , transform_source = js_to_json )
2015-07-17 03:27:47 -05:00
desc = re . search ( r ' var RTR_VideoBlurb \ s*= \ s* " (.+?) " ; ' , innerhtml , re . DOTALL )
2015-07-13 07:39:55 -05:00
if desc :
vdata [ ' description ' ] = desc . group ( 1 )
2015-07-17 03:27:47 -05:00
vdata [ ' vidnum ' ] = vidnum
2015-07-13 07:39:55 -05:00
ret . append ( vdata )
return ret
def _real_extract ( self , url ) :
mobj = re . match ( self . _VALID_URL , url )
video_id = mobj . group ( ' id ' )
ret = [ ]
webpage = self . _download_webpage ( url , video_id )
vids = self . _scrape_javascript ( webpage )
for vid in vids :
2015-07-17 03:27:47 -05:00
vurl = vid . get ( ' flv ' , vid . get ( ' mpeg ' ) )
if vurl :
formats = [ ]
formats . append ( {
' url ' : vurl ,
' format_id ' : ' embed-flv- ' + str ( vid . get ( ' vidnum ' ) ) ,
' protocol ' : vurl . split ( ' : ' ) [ 0 ] ,
' width ' : int_or_none ( vid . get ( ' width ' ) ) ,
' height ' : int_or_none ( vid . get ( ' height ' ) ) ,
' ext ' : ' flv ' ,
' tbr ' : 1080.0 if vid . get ( ' vbc ' , ' vbcValue ' ) == ' vbcValue ' else float_or_none ( vid . get ( ' vbc ' ) ) ,
} )
yo_id_str = re . search ( r ' yospace.+/( \ d+) \ ?f=( \ d+) ' , vurl )
if yo_id_str :
yo_id = yo_id_str . group ( 1 ) + ' / ' + yo_id_str . group ( 2 )
murl = ' http://mas-e.cds1.yospace.com/mas/ ' + yo_id + ' ?trans=json '
# yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id
formats . extend ( self . _extract_formats ( murl , video_id ) )
if formats :
self . _sort_formats ( formats )
ret . append ( {
' id ' : vid . get ( ' id ' , video_id ) ,
' title ' : vid . get ( ' title ' , video_id ) ,
' description ' : vid . get ( ' description ' ) ,
' webpage_url ' : url ,
' formats ' : formats ,
} )
2015-07-13 07:39:55 -05:00
if not ret :
raise ExtractorError ( ' No video found ' , expected = True )
if len ( ret ) > 1 :
2015-07-17 03:27:47 -05:00
return self . playlist_result ( ret , video_id , ' reuters ' )
2015-07-13 07:39:55 -05:00
return ret [ 0 ]