From 1ee382603fe421cace6bf4f29ee989047ace2f93 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 07:39:55 -0500 Subject: [PATCH 1/3] Add support for reuters.com news videos, and yospace.com backend --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/yospace.py | 206 +++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 youtube_dl/extractor/yospace.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..0be279fc6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -757,6 +757,7 @@ from .yandexmusic import ( from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE +from .yospace import YospaceIE, ReutersIE from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE diff --git a/youtube_dl/extractor/yospace.py b/youtube_dl/extractor/yospace.py new file mode 100644 index 000000000..250f86218 --- /dev/null +++ b/youtube_dl/extractor/yospace.py @@ -0,0 +1,206 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + strip_jsonp, + js_to_json, + float_or_none, + int_or_none, +) +import re +from pprint import pprint + +class YospaceIE(InfoExtractor): + _VALID_URL = r'http://(?:csm-[a-z]|mas-[a-z]).cds\d+.yospace.com/(?Pcsm|mas)/(?P\d+/\d+)' + _TESTS = [ + { + 'url': 'http://csm-e.cds1.yospace.com/csm/108986746/100312513735', + 'info_dict': { + 'id': '108986746_100312513735', + 'ext': 'mp4', + 'title': '108986746_100312513735', + 'description': None, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + ] + + def _extract_m3u8(self, hls_url): + formats = self._extract_m3u8_formats(hls_url, 'hls', 'mp4', m3u8_id='hls') + return formats + + def _extract_formats(self, mas_url, video_id): + formats = [] + hls_url = None + jfpage = self._download_webpage(mas_url, video_id) + jf = self._parse_json(jfpage, video_id, transform_source=js_to_json) + for ent in jf: + if ent.get('type','') == 'application/x-mpeg-url': + hls_url = ent.get('url') + formats.extend(self._extract_m3u8(hls_url)) + else: + tbr = float_or_none(ent.get('size',0), 1000) + if tbr == 0: + r = re.search(r'[\?\&]q=(\d+)', ent.get('url')) + if r: + tbr=float_or_none(r.group(1), 1) + formats.append({ + 'url': ent.get('url'), + 'format_id': ent.get('method','unknown')+'-'+ent.get('container','unknown'), + 'protocol': ent.get('url').split(':')[0], + 'tbr': tbr, + 'ext': ent.get('container','unknown') + }) + return formats + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + url_type = mobj.group('type') + display_id = url_type + title = display_id + formats = [] + hls_url = None + + if url_type == 'mas': + mas_url = url.split('?')[0]+'?trans=json' + formats = self._extract_formats(mas_url, video_id) + else: + hls_url = url + + if hls_url is not None: + formats.extend(self._extract_m3u8(hls_url)) + + + self._sort_formats(formats) + + return { + 'id': video_id.replace('/','_'), + 'display_id': display_id, + 'title': video_id.replace('/','_'), + 'formats': formats, + } + +class ReutersIE(YospaceIE): + _VALID_URL = r'http://(?:www\.)?reuters.com/.*?(?P[^/]+)$' + _TESTS = [ + { + 'url': 'http://www.reuters.com/article/2015/06/22/mideast-crisis-turkey-idINKBN0P21VN20150622', + 'info_dict': { + 'id': '364679847', + 'ext': 'mp4', + 'title': 'Refugees flood back into Syria from Turkey', + 'description': 'Thousands of Syrians have streamed back across the border from Turkey to their hometown, now liberated from Islamic State. Sean Carberry reports.', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.reuters.com/video/2015/07/10/mexican-volcano-spews-fire-and-ash?videoId=364911782', + 'info_dict': { + 'id': '364911782', + 'ext': 'mp4', + 'title': 'Mexican volcano spews fire and ash', + 'description': 'Mexico\'s Fire Volcano spews fire and ash as streams of lava run down its side. Rough Cut (no reporter narration).', + }, + 'params': { + 'skip_download': True, + }, + }, + ] + + def _scrape_javascript(self,webpage): + ret = [] + rdata = {} + + javascript_chunks = re.findall(r']+text/javascript[^>]*>(.*?)',webpage,re.DOTALL) + if not javascript_chunks: + return + def msub(m): + s = m.group(1) + if rdata.get(s): + s = rdata.get(s) + return ': "'+s+'",\n' + return ': False,\n' + vidnum = 0 + for innerhtml in javascript_chunks: + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);',innerhtml,re.DOTALL); + if drawplayer_js: + vidnum += 1 + drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n','',drawplayer_js.group(1)) + vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json) + desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";',innerhtml,re.DOTALL) + if desc: + vdata['description'] = desc.group(1) + vdata['vidnum']=vidnum + ret.append(vdata) + else: + if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n',innerhtml,re.M): + js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n',innerhtml,re.M) + for ent in js_vars: + if re.search(r'["\'].+?[\(\)\+]',ent[1]): + continue + if not ent[1]: + continue + rdata[ent[0]] = ent[1] + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);',innerhtml,re.DOTALL); + if drawplayer_js: + vidnum += 1 + ds = drawplayer_js.group(1) + ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n',msub,ds) + # "stuff_with": "variables like "+this("breaks")+" js_to_json"; + ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n','',ds) + vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) + vdata['vidnum']=vidnum + ret.append(vdata) + return ret + + def _real_extract(self, url): + from .yospace import YospaceIE + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + ret = [] + webpage = self._download_webpage(url, video_id) + vids = self._scrape_javascript(webpage) + for vid in vids: + vurl = vid.get('flv',vid.get('mpeg')) + if vurl: + formats = [] + formats.append({ + 'url': vurl, + 'format_id': 'embed-flv-'+str(vid.get('vidnum')), + 'protocol': vurl.split(':')[0], + 'width': int_or_none(vid.get('width')), + 'height': int_or_none(vid.get('height')), + 'ext': 'flv', + 'tbr': 1080.0 if vid.get('vbc','vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')), + }) + yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)',vurl) + if yo_id_str: + yo_id = yo_id_str.group(1)+'/'+yo_id_str.group(2) + murl = 'http://mas-e.cds1.yospace.com/mas/'+yo_id+'?trans=json' + yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id + formats.extend(self._extract_formats(murl, video_id)) + if formats: + self._sort_formats(formats) + ret.append({ + 'id': vid.get('id',video_id), + 'title': vid.get('title',video_id), + 'description': vid.get('description'), + 'webpage_url': url, + 'formats': formats, + }) + if not ret: + raise ExtractorError('No video found', expected=True) + if len(ret) > 1: + return self.playlist_result(ret, video_id, 'reuters') + return ret[0] + + From d4ebd851ef9916d9a6f06a1f5ea261295be6f1c2 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 03:27:47 -0500 Subject: [PATCH 2/3] reuters: fix syntax --- youtube_dl/extractor/yospace.py | 158 ++++++++++++++++---------------- 1 file changed, 77 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/yospace.py b/youtube_dl/extractor/yospace.py index 250f86218..7e0d67d36 100644 --- a/youtube_dl/extractor/yospace.py +++ b/youtube_dl/extractor/yospace.py @@ -4,14 +4,12 @@ import re from .common import InfoExtractor from ..utils import ( - parse_iso8601, - strip_jsonp, - js_to_json, - float_or_none, - int_or_none, + ExtractorError, + js_to_json, + float_or_none, + int_or_none, ) -import re -from pprint import pprint + class YospaceIE(InfoExtractor): _VALID_URL = r'http://(?:csm-[a-z]|mas-[a-z]).cds\d+.yospace.com/(?Pcsm|mas)/(?P\d+/\d+)' @@ -41,22 +39,22 @@ class YospaceIE(InfoExtractor): jfpage = self._download_webpage(mas_url, video_id) jf = self._parse_json(jfpage, video_id, transform_source=js_to_json) for ent in jf: - if ent.get('type','') == 'application/x-mpeg-url': - hls_url = ent.get('url') - formats.extend(self._extract_m3u8(hls_url)) - else: - tbr = float_or_none(ent.get('size',0), 1000) - if tbr == 0: - r = re.search(r'[\?\&]q=(\d+)', ent.get('url')) - if r: - tbr=float_or_none(r.group(1), 1) - formats.append({ - 'url': ent.get('url'), - 'format_id': ent.get('method','unknown')+'-'+ent.get('container','unknown'), - 'protocol': ent.get('url').split(':')[0], - 'tbr': tbr, - 'ext': ent.get('container','unknown') - }) + if ent.get('type', '') == 'application/x-mpeg-url': + hls_url = ent.get('url') + formats.extend(self._extract_m3u8(hls_url)) + else: + tbr = float_or_none(ent.get('size', 0), 1000) + if tbr == 0: + r = re.search(r'[\?\&]q=(\d+)', ent.get('url')) + if r: + tbr = float_or_none(r.group(1), 1) + formats.append({ + 'url': ent.get('url'), + 'format_id': ent.get('method', 'unknown') + '-' + ent.get('container', 'unknown'), + 'protocol': ent.get('url').split(':')[0], + 'tbr': tbr, + 'ext': ent.get('container', 'unknown') + }) return formats def _real_extract(self, url): @@ -64,12 +62,11 @@ class YospaceIE(InfoExtractor): video_id = mobj.group('id') url_type = mobj.group('type') display_id = url_type - title = display_id formats = [] hls_url = None if url_type == 'mas': - mas_url = url.split('?')[0]+'?trans=json' + mas_url = url.split('?')[0] + '?trans=json' formats = self._extract_formats(mas_url, video_id) else: hls_url = url @@ -77,16 +74,16 @@ class YospaceIE(InfoExtractor): if hls_url is not None: formats.extend(self._extract_m3u8(hls_url)) - self._sort_formats(formats) return { - 'id': video_id.replace('/','_'), + 'id': video_id.replace('/', '_'), 'display_id': display_id, - 'title': video_id.replace('/','_'), + 'title': video_id.replace('/', '_'), 'formats': formats, } + class ReutersIE(YospaceIE): _VALID_URL = r'http://(?:www\.)?reuters.com/.*?(?P[^/]+)$' _TESTS = [ @@ -116,91 +113,90 @@ class ReutersIE(YospaceIE): }, ] - def _scrape_javascript(self,webpage): + def _scrape_javascript(self, webpage): ret = [] rdata = {} - - javascript_chunks = re.findall(r']+text/javascript[^>]*>(.*?)',webpage,re.DOTALL) + + javascript_chunks = re.findall(r']+text/javascript[^>]*>(.*?)', webpage, re.DOTALL) if not javascript_chunks: return + def msub(m): s = m.group(1) if rdata.get(s): s = rdata.get(s) - return ': "'+s+'",\n' + return ': "' + s + '",\n' return ': False,\n' + vidnum = 0 for innerhtml in javascript_chunks: - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);',innerhtml,re.DOTALL); + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);', innerhtml, re.DOTALL) if drawplayer_js: vidnum += 1 - drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n','',drawplayer_js.group(1)) + drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n', '', drawplayer_js.group(1)) vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json) - desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";',innerhtml,re.DOTALL) + desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";', innerhtml, re.DOTALL) if desc: vdata['description'] = desc.group(1) - vdata['vidnum']=vidnum + vdata['vidnum'] = vidnum ret.append(vdata) else: - if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n',innerhtml,re.M): - js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n',innerhtml,re.M) + if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n', innerhtml, re.M): + js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n', innerhtml, re.M) for ent in js_vars: - if re.search(r'["\'].+?[\(\)\+]',ent[1]): - continue if not ent[1]: - continue + continue + if re.search(r'["\'].+?[\(\)\+]', ent[1]): + continue rdata[ent[0]] = ent[1] - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);',innerhtml,re.DOTALL); + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL) if drawplayer_js: - vidnum += 1 - ds = drawplayer_js.group(1) - ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n',msub,ds) - # "stuff_with": "variables like "+this("breaks")+" js_to_json"; - ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n','',ds) - vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) - vdata['vidnum']=vidnum - ret.append(vdata) + vidnum += 1 + ds = drawplayer_js.group(1) + ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n', msub, ds) + # "stuff_with": "variables like "+this("breaks")+" js_to_json"; + ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n', '', ds) + vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) + vdata['vidnum'] = vidnum + ret.append(vdata) return ret def _real_extract(self, url): - from .yospace import YospaceIE mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') ret = [] webpage = self._download_webpage(url, video_id) vids = self._scrape_javascript(webpage) for vid in vids: - vurl = vid.get('flv',vid.get('mpeg')) - if vurl: - formats = [] - formats.append({ - 'url': vurl, - 'format_id': 'embed-flv-'+str(vid.get('vidnum')), - 'protocol': vurl.split(':')[0], - 'width': int_or_none(vid.get('width')), - 'height': int_or_none(vid.get('height')), - 'ext': 'flv', - 'tbr': 1080.0 if vid.get('vbc','vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')), - }) - yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)',vurl) - if yo_id_str: - yo_id = yo_id_str.group(1)+'/'+yo_id_str.group(2) - murl = 'http://mas-e.cds1.yospace.com/mas/'+yo_id+'?trans=json' - yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id - formats.extend(self._extract_formats(murl, video_id)) - if formats: - self._sort_formats(formats) - ret.append({ - 'id': vid.get('id',video_id), - 'title': vid.get('title',video_id), - 'description': vid.get('description'), - 'webpage_url': url, - 'formats': formats, - }) + vurl = vid.get('flv', vid.get('mpeg')) + if vurl: + formats = [] + formats.append({ + 'url': vurl, + 'format_id': 'embed-flv-' + str(vid.get('vidnum')), + 'protocol': vurl.split(':')[0], + 'width': int_or_none(vid.get('width')), + 'height': int_or_none(vid.get('height')), + 'ext': 'flv', + 'tbr': 1080.0 if vid.get('vbc', 'vbcValue') == 'vbcValue' else float_or_none(vid.get('vbc')), + }) + yo_id_str = re.search(r'yospace.+/(\d+)\?f=(\d+)', vurl) + if yo_id_str: + yo_id = yo_id_str.group(1) + '/' + yo_id_str.group(2) + murl = 'http://mas-e.cds1.yospace.com/mas/' + yo_id + '?trans=json' + # yurl = 'http://csm-e.cds1.yospace.com/csm/'+yo_id + formats.extend(self._extract_formats(murl, video_id)) + if formats: + self._sort_formats(formats) + ret.append({ + 'id': vid.get('id', video_id), + 'title': vid.get('title', video_id), + 'description': vid.get('description'), + 'webpage_url': url, + 'formats': formats, + }) if not ret: raise ExtractorError('No video found', expected=True) if len(ret) > 1: - return self.playlist_result(ret, video_id, 'reuters') + return self.playlist_result(ret, video_id, 'reuters') return ret[0] - - From b811d57342bbb6e489270922c1ff4bf0db30ab63 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 03:33:33 -0500 Subject: [PATCH 3/3] Better js handling / tolerance of messy json --- youtube_dl/extractor/yospace.py | 63 +++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/yospace.py b/youtube_dl/extractor/yospace.py index 7e0d67d36..63b381936 100644 --- a/youtube_dl/extractor/yospace.py +++ b/youtube_dl/extractor/yospace.py @@ -36,7 +36,7 @@ class YospaceIE(InfoExtractor): def _extract_formats(self, mas_url, video_id): formats = [] hls_url = None - jfpage = self._download_webpage(mas_url, video_id) + jfpage = self._download_webpage(mas_url, 'json') jf = self._parse_json(jfpage, video_id, transform_source=js_to_json) for ent in jf: if ent.get('type', '') == 'application/x-mpeg-url': @@ -117,7 +117,7 @@ class ReutersIE(YospaceIE): ret = [] rdata = {} - javascript_chunks = re.findall(r']+text/javascript[^>]*>(.*?)', webpage, re.DOTALL) + javascript_chunks = re.findall(r']*>(.*?)', webpage, re.DOTALL) if not javascript_chunks: return @@ -125,40 +125,51 @@ class ReutersIE(YospaceIE): s = m.group(1) if rdata.get(s): s = rdata.get(s) - return ': "' + s + '",\n' - return ': False,\n' + return ': ' + s + m.group(2) + '' + return ': False' + m.group(2) + '' + + def cleanjsonvars(str): # just str/int variables that won't break js_to_json + # restr=r'[\'"]([^\'"]+)[\'"]\s*:\s*(([\'"])(|.*?[^\\])\3|\d+|[a-zA-Z0-9\._]+)\s*[,\}\]]' + restr = r"""(?x) + [\'"]([^\'"]+)[\'"] # quoted key + \s*:\s* # key -> var + ( # var: str/int/bareword + ([\'"]) # str: startquote -> \3 + ( # + | # str: blank + .*?[^\\] # str: accounting for \'s + ) # + \3| # str: endquote + \d+| # int + [a-zA-Z0-9\._]+ # bareword + ) # + \s*[,\}\]] # end with , or } ] if nested + """ + m = re.findall(restr, str) + return '{' + '\n'.join(["'" + f[0] + "': " + f[1] + ',' + for f in m]) + '}' vidnum = 0 for innerhtml in javascript_chunks: - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);', innerhtml, re.DOTALL) + + js_vars = re.findall(r'^\s*(Reuters\.[a-zA-Z0-9\._]+)\s*=\s*([\'"](?:|.*?[^\\][\'"])|\d+);', innerhtml, re.M) + if js_vars: + for ent in js_vars: + if not ent[1]: + continue + rdata[ent[0]] = ent[1] + + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL) if drawplayer_js: vidnum += 1 - drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n', '', drawplayer_js.group(1)) - vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json) + js = cleanjsonvars(drawplayer_js.group(1)) + js = re.sub(r':\s*(Reuters\.[a-zA-Z_]+\.[a-zA-Z_]+)\s*([,\}])', msub, js) + vdata = self._parse_json(js, 'javascript chunk', transform_source=js_to_json) desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";', innerhtml, re.DOTALL) if desc: vdata['description'] = desc.group(1) vdata['vidnum'] = vidnum ret.append(vdata) - else: - if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n', innerhtml, re.M): - js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n', innerhtml, re.M) - for ent in js_vars: - if not ent[1]: - continue - if re.search(r'["\'].+?[\(\)\+]', ent[1]): - continue - rdata[ent[0]] = ent[1] - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL) - if drawplayer_js: - vidnum += 1 - ds = drawplayer_js.group(1) - ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n', msub, ds) - # "stuff_with": "variables like "+this("breaks")+" js_to_json"; - ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n', '', ds) - vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) - vdata['vidnum'] = vidnum - ret.append(vdata) return ret def _real_extract(self, url):