reuters: fix syntax

This commit is contained in:
fnord 2015-07-17 03:27:47 -05:00
parent 1ee382603f
commit d4ebd851ef

View File

@@ -4,14 +4,12 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
parse_iso8601, ExtractorError,
strip_jsonp,
js_to_json, js_to_json,
float_or_none, float_or_none,
int_or_none, int_or_none,
) )
import re
from pprint import pprint
class YospaceIE(InfoExtractor): class YospaceIE(InfoExtractor):
_VALID_URL = r'http://(?:csm-[a-z]|mas-[a-z]).cds\d+.yospace.com/(?P<type>csm|mas)/(?P<id>\d+/\d+)' _VALID_URL = r'http://(?:csm-[a-z]|mas-[a-z]).cds\d+.yospace.com/(?P<type>csm|mas)/(?P<id>\d+/\d+)'
@@ -41,21 +39,21 @@ class YospaceIE(InfoExtractor):
jfpage = self._download_webpage(mas_url, video_id) jfpage = self._download_webpage(mas_url, video_id)
jf = self._parse_json(jfpage, video_id, transform_source=js_to_json) jf = self._parse_json(jfpage, video_id, transform_source=js_to_json)
for ent in jf: for ent in jf:
if ent.get('type','') == 'application/x-mpeg-url': if ent.get('type', '') == 'application/x-mpeg-url':
hls_url = ent.get('url') hls_url = ent.get('url')
formats.extend(self._extract_m3u8(hls_url)) formats.extend(self._extract_m3u8(hls_url))
else: else:
tbr = float_or_none(ent.get('size',0), 1000) tbr = float_or_none(ent.get('size', 0), 1000)
if tbr == 0: if tbr == 0:
r = re.search(r'[\?\&]q=(\d+)', ent.get('url')) r = re.search(r'[\?\&]q=(\d+)', ent.get('url'))
if r: if r:
tbr=float_or_none(r.group(1), 1) tbr = float_or_none(r.group(1), 1)
formats.append({ formats.append({
'url': ent.get('url'), 'url': ent.get('url'),
'format_id': ent.get('method','unknown')+'-'+ent.get('container','unknown'), 'format_id': ent.get('method', 'unknown') + '-' + ent.get('container', 'unknown'),
'protocol': ent.get('url').split(':')[0], 'protocol': ent.get('url').split(':')[0],
'tbr': tbr, 'tbr': tbr,
'ext': ent.get('container','unknown') 'ext': ent.get('container', 'unknown')
}) })
return formats return formats
@@ -64,12 +62,11 @@ class YospaceIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
url_type = mobj.group('type') url_type = mobj.group('type')
display_id = url_type display_id = url_type
title = display_id
formats = [] formats = []
hls_url = None hls_url = None
if url_type == 'mas': if url_type == 'mas':
mas_url = url.split('?')[0]+'?trans=json' mas_url = url.split('?')[0] + '?trans=json'
formats = self._extract_formats(mas_url, video_id) formats = self._extract_formats(mas_url, video_id)
else: else:
hls_url = url hls_url = url
@@ -77,16 +74,16 @@ class YospaceIE(InfoExtractor):
if hls_url is not None: if hls_url is not None:
formats.extend(self._extract_m3u8(hls_url)) formats.extend(self._extract_m3u8(hls_url))
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': video_id.replace('/','_'), 'id': video_id.replace('/', '_'),
'display_id': display_id, 'display_id': display_id,
'title': video_id.replace('/','_'), 'title': video_id.replace('/', '_'),
'formats': formats, 'formats': formats,
} }
class ReutersIE(YospaceIE): class ReutersIE(YospaceIE):
_VALID_URL = r'http://(?:www\.)?reuters.com/.*?(?P<id>[^/]+)$' _VALID_URL = r'http://(?:www\.)?reuters.com/.*?(?P<id>[^/]+)$'
_TESTS = [ _TESTS = [
@@ -116,83 +113,84 @@ class ReutersIE(YospaceIE):
}, },
] ]
def _scrape_javascript(self,webpage): def _scrape_javascript(self, webpage):
ret = [] ret = []
rdata = {} rdata = {}
javascript_chunks = re.findall(r'<script[^>]+text/javascript[^>]*>(.*?)</script>',webpage,re.DOTALL) javascript_chunks = re.findall(r'<script[^>]+text/javascript[^>]*>(.*?)</script>', webpage, re.DOTALL)
if not javascript_chunks: if not javascript_chunks:
return return
def msub(m): def msub(m):
s = m.group(1) s = m.group(1)
if rdata.get(s): if rdata.get(s):
s = rdata.get(s) s = rdata.get(s)
return ': "'+s+'",\n' return ': "' + s + '",\n'
return ': False,\n' return ': False,\n'
vidnum = 0 vidnum = 0
for innerhtml in javascript_chunks: for innerhtml in javascript_chunks:
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);',innerhtml,re.DOTALL); drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);', innerhtml, re.DOTALL)
if drawplayer_js: if drawplayer_js:
vidnum += 1 vidnum += 1
drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n','',drawplayer_js.group(1)) drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n', '', drawplayer_js.group(1))
vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json) vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json)
desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";',innerhtml,re.DOTALL) desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";', innerhtml, re.DOTALL)
if desc: if desc:
vdata['description'] = desc.group(1) vdata['description'] = desc.group(1)
vdata['vidnum']=vidnum vdata['vidnum'] = vidnum
ret.append(vdata) ret.append(vdata)
else: else:
if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n',innerhtml,re.M): if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n', innerhtml, re.M):
js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n',innerhtml,re.M) js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n', innerhtml, re.M)
for ent in js_vars: for ent in js_vars:
if re.search(r'["\'].+?[\(\)\+]',ent[1]):
continue
if not ent[1]: if not ent[1]:
continue continue
if re.search(r'["\'].+?[\(\)\+]', ent[1]):
continue
rdata[ent[0]] = ent[1] rdata[ent[0]] = ent[1]
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);',innerhtml,re.DOTALL); drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL)
if drawplayer_js: if drawplayer_js:
vidnum += 1 vidnum += 1
ds = drawplayer_js.group(1) ds = drawplayer_js.group(1)
ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n',msub,ds) ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n', msub, ds)
# "stuff_with": "variables like "+this("breaks")+" js_to_json"; # "stuff_with": "variables like "+this("breaks")+" js_to_json";
ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n','',ds) ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n', '', ds)
vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json)
vdata['vidnum']=vidnum vdata['vidnum'] = vidnum
ret.append(vdata) ret.append(vdata)
return ret return ret
def _real_extract(self, url): def _real_extract(self, url):
from .yospace import YospaceIE
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
ret = [] ret = []
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
vids = self._scrape_javascript(webpage) vids = self._scrape_javascript(webpage)
for vid in vids: for vid in vids:
vurl = vid.get('flv',vid.get('mpeg')) vurl = vid.get('flv', vid.get('mpeg'))
if vurl: if vurl:
formats = [] formats = []
formats.append({ formats.append({
'url': vurl, 'url': vurl,
'format_id': 'embed-flv-'+str(vid.get('vidnum')), 'format_id': 'embed-flv-' + str(vid.get('vidnum')),
'protocol': vurl.split(':')[0], 'protocol': vurl.split(':')[0],
'width': int_or_none(vid.get('width')), 'width': int_or_none(vid.get('width')),
'height': int_or_none(vid.get('height')), 'height': int_or_none(vid.get('height')),
'ext': 'flv', 'ext': 'flv',
'tbr': 1080.0 if vid.get('vbc','vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')), 'tbr': 1080.0 if vid.get('vbc', 'vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')),
}) })
yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)',vurl) yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)', vurl)
if yo_id_str: if yo_id_str:
yo_id = yo_id_str.group(1)+'/'+yo_id_str.group(2) yo_id = yo_id_str.group(1) + '/' + yo_id_str.group(2)
murl = 'http://mas-e.cds1.yospace.com/mas/'+yo_id+'?trans=json' murl = 'http://mas-e.cds1.yospace.com/mas/' + yo_id + '?trans=json'
yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id # yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id
formats.extend(self._extract_formats(murl, video_id)) formats.extend(self._extract_formats(murl, video_id))
if formats: if formats:
self._sort_formats(formats) self._sort_formats(formats)
ret.append({ ret.append({
'id': vid.get('id',video_id), 'id': vid.get('id', video_id),
'title': vid.get('title',video_id), 'title': vid.get('title', video_id),
'description': vid.get('description'), 'description': vid.get('description'),
'webpage_url': url, 'webpage_url': url,
'formats': formats, 'formats': formats,
@@ -202,5 +200,3 @@ class ReutersIE(YospaceIE):
if len(ret) > 1: if len(ret) > 1:
return self.playlist_result(ret, video_id, 'reuters') return self.playlist_result(ret, video_id, 'reuters')
return ret[0] return ret[0]