reuters: fix syntax
This commit is contained in:
parent
1ee382603f
commit
d4ebd851ef
@ -4,14 +4,12 @@ import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
parse_iso8601,
|
||||
strip_jsonp,
|
||||
js_to_json,
|
||||
float_or_none,
|
||||
int_or_none,
|
||||
ExtractorError,
|
||||
js_to_json,
|
||||
float_or_none,
|
||||
int_or_none,
|
||||
)
|
||||
import re
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
class YospaceIE(InfoExtractor):
|
||||
_VALID_URL = r'http://(?:csm-[a-z]|mas-[a-z]).cds\d+.yospace.com/(?P<type>csm|mas)/(?P<id>\d+/\d+)'
|
||||
@ -41,22 +39,22 @@ class YospaceIE(InfoExtractor):
|
||||
jfpage = self._download_webpage(mas_url, video_id)
|
||||
jf = self._parse_json(jfpage, video_id, transform_source=js_to_json)
|
||||
for ent in jf:
|
||||
if ent.get('type','') == 'application/x-mpeg-url':
|
||||
hls_url = ent.get('url')
|
||||
formats.extend(self._extract_m3u8(hls_url))
|
||||
else:
|
||||
tbr = float_or_none(ent.get('size',0), 1000)
|
||||
if tbr == 0:
|
||||
r = re.search(r'[\?\&]q=(\d+)', ent.get('url'))
|
||||
if r:
|
||||
tbr=float_or_none(r.group(1), 1)
|
||||
formats.append({
|
||||
'url': ent.get('url'),
|
||||
'format_id': ent.get('method','unknown')+'-'+ent.get('container','unknown'),
|
||||
'protocol': ent.get('url').split(':')[0],
|
||||
'tbr': tbr,
|
||||
'ext': ent.get('container','unknown')
|
||||
})
|
||||
if ent.get('type', '') == 'application/x-mpeg-url':
|
||||
hls_url = ent.get('url')
|
||||
formats.extend(self._extract_m3u8(hls_url))
|
||||
else:
|
||||
tbr = float_or_none(ent.get('size', 0), 1000)
|
||||
if tbr == 0:
|
||||
r = re.search(r'[\?\&]q=(\d+)', ent.get('url'))
|
||||
if r:
|
||||
tbr = float_or_none(r.group(1), 1)
|
||||
formats.append({
|
||||
'url': ent.get('url'),
|
||||
'format_id': ent.get('method', 'unknown') + '-' + ent.get('container', 'unknown'),
|
||||
'protocol': ent.get('url').split(':')[0],
|
||||
'tbr': tbr,
|
||||
'ext': ent.get('container', 'unknown')
|
||||
})
|
||||
return formats
|
||||
|
||||
def _real_extract(self, url):
|
||||
@ -64,12 +62,11 @@ class YospaceIE(InfoExtractor):
|
||||
video_id = mobj.group('id')
|
||||
url_type = mobj.group('type')
|
||||
display_id = url_type
|
||||
title = display_id
|
||||
formats = []
|
||||
hls_url = None
|
||||
|
||||
if url_type == 'mas':
|
||||
mas_url = url.split('?')[0]+'?trans=json'
|
||||
mas_url = url.split('?')[0] + '?trans=json'
|
||||
formats = self._extract_formats(mas_url, video_id)
|
||||
else:
|
||||
hls_url = url
|
||||
@ -77,16 +74,16 @@ class YospaceIE(InfoExtractor):
|
||||
if hls_url is not None:
|
||||
formats.extend(self._extract_m3u8(hls_url))
|
||||
|
||||
|
||||
self._sort_formats(formats)
|
||||
|
||||
return {
|
||||
'id': video_id.replace('/','_'),
|
||||
'id': video_id.replace('/', '_'),
|
||||
'display_id': display_id,
|
||||
'title': video_id.replace('/','_'),
|
||||
'title': video_id.replace('/', '_'),
|
||||
'formats': formats,
|
||||
}
|
||||
|
||||
|
||||
class ReutersIE(YospaceIE):
|
||||
_VALID_URL = r'http://(?:www\.)?reuters.com/.*?(?P<id>[^/]+)$'
|
||||
_TESTS = [
|
||||
@ -116,91 +113,90 @@ class ReutersIE(YospaceIE):
|
||||
},
|
||||
]
|
||||
|
||||
def _scrape_javascript(self,webpage):
|
||||
def _scrape_javascript(self, webpage):
|
||||
ret = []
|
||||
rdata = {}
|
||||
|
||||
javascript_chunks = re.findall(r'<script[^>]+text/javascript[^>]*>(.*?)</script>',webpage,re.DOTALL)
|
||||
javascript_chunks = re.findall(r'<script[^>]+text/javascript[^>]*>(.*?)</script>', webpage, re.DOTALL)
|
||||
if not javascript_chunks:
|
||||
return
|
||||
|
||||
def msub(m):
|
||||
s = m.group(1)
|
||||
if rdata.get(s):
|
||||
s = rdata.get(s)
|
||||
return ': "'+s+'",\n'
|
||||
return ': "' + s + '",\n'
|
||||
return ': False,\n'
|
||||
|
||||
vidnum = 0
|
||||
for innerhtml in javascript_chunks:
|
||||
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);',innerhtml,re.DOTALL);
|
||||
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);', innerhtml, re.DOTALL)
|
||||
if drawplayer_js:
|
||||
vidnum += 1
|
||||
drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n','',drawplayer_js.group(1))
|
||||
drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n', '', drawplayer_js.group(1))
|
||||
vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json)
|
||||
desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";',innerhtml,re.DOTALL)
|
||||
desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";', innerhtml, re.DOTALL)
|
||||
if desc:
|
||||
vdata['description'] = desc.group(1)
|
||||
vdata['vidnum']=vidnum
|
||||
vdata['vidnum'] = vidnum
|
||||
ret.append(vdata)
|
||||
else:
|
||||
if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n',innerhtml,re.M):
|
||||
js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n',innerhtml,re.M)
|
||||
if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n', innerhtml, re.M):
|
||||
js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n', innerhtml, re.M)
|
||||
for ent in js_vars:
|
||||
if re.search(r'["\'].+?[\(\)\+]',ent[1]):
|
||||
continue
|
||||
if not ent[1]:
|
||||
continue
|
||||
continue
|
||||
if re.search(r'["\'].+?[\(\)\+]', ent[1]):
|
||||
continue
|
||||
rdata[ent[0]] = ent[1]
|
||||
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);',innerhtml,re.DOTALL);
|
||||
drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL)
|
||||
if drawplayer_js:
|
||||
vidnum += 1
|
||||
ds = drawplayer_js.group(1)
|
||||
ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n',msub,ds)
|
||||
# "stuff_with": "variables like "+this("breaks")+" js_to_json";
|
||||
ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n','',ds)
|
||||
vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json)
|
||||
vdata['vidnum']=vidnum
|
||||
ret.append(vdata)
|
||||
vidnum += 1
|
||||
ds = drawplayer_js.group(1)
|
||||
ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n', msub, ds)
|
||||
# "stuff_with": "variables like "+this("breaks")+" js_to_json";
|
||||
ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n', '', ds)
|
||||
vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json)
|
||||
vdata['vidnum'] = vidnum
|
||||
ret.append(vdata)
|
||||
return ret
|
||||
|
||||
def _real_extract(self, url):
|
||||
from .yospace import YospaceIE
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
video_id = mobj.group('id')
|
||||
ret = []
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
vids = self._scrape_javascript(webpage)
|
||||
for vid in vids:
|
||||
vurl = vid.get('flv',vid.get('mpeg'))
|
||||
if vurl:
|
||||
formats = []
|
||||
formats.append({
|
||||
'url': vurl,
|
||||
'format_id': 'embed-flv-'+str(vid.get('vidnum')),
|
||||
'protocol': vurl.split(':')[0],
|
||||
'width': int_or_none(vid.get('width')),
|
||||
'height': int_or_none(vid.get('height')),
|
||||
'ext': 'flv',
|
||||
'tbr': 1080.0 if vid.get('vbc','vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')),
|
||||
})
|
||||
yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)',vurl)
|
||||
if yo_id_str:
|
||||
yo_id = yo_id_str.group(1)+'/'+yo_id_str.group(2)
|
||||
murl = 'http://mas-e.cds1.yospace.com/mas/'+yo_id+'?trans=json'
|
||||
yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id
|
||||
formats.extend(self._extract_formats(murl, video_id))
|
||||
if formats:
|
||||
self._sort_formats(formats)
|
||||
ret.append({
|
||||
'id': vid.get('id',video_id),
|
||||
'title': vid.get('title',video_id),
|
||||
'description': vid.get('description'),
|
||||
'webpage_url': url,
|
||||
'formats': formats,
|
||||
})
|
||||
vurl = vid.get('flv', vid.get('mpeg'))
|
||||
if vurl:
|
||||
formats = []
|
||||
formats.append({
|
||||
'url': vurl,
|
||||
'format_id': 'embed-flv-' + str(vid.get('vidnum')),
|
||||
'protocol': vurl.split(':')[0],
|
||||
'width': int_or_none(vid.get('width')),
|
||||
'height': int_or_none(vid.get('height')),
|
||||
'ext': 'flv',
|
||||
'tbr': 1080.0 if vid.get('vbc', 'vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')),
|
||||
})
|
||||
yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)', vurl)
|
||||
if yo_id_str:
|
||||
yo_id = yo_id_str.group(1) + '/' + yo_id_str.group(2)
|
||||
murl = 'http://mas-e.cds1.yospace.com/mas/' + yo_id + '?trans=json'
|
||||
# yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id
|
||||
formats.extend(self._extract_formats(murl, video_id))
|
||||
if formats:
|
||||
self._sort_formats(formats)
|
||||
ret.append({
|
||||
'id': vid.get('id', video_id),
|
||||
'title': vid.get('title', video_id),
|
||||
'description': vid.get('description'),
|
||||
'webpage_url': url,
|
||||
'formats': formats,
|
||||
})
|
||||
if not ret:
|
||||
raise ExtractorError('No video found', expected=True)
|
||||
if len(ret) > 1:
|
||||
return self.playlist_result(ret, video_id, 'reuters')
|
||||
return self.playlist_result(ret, video_id, 'reuters')
|
||||
return ret[0]
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user