diff --git a/youtube_dl/extractor/yospace.py b/youtube_dl/extractor/yospace.py index 7e0d67d36..63b381936 100644 --- a/youtube_dl/extractor/yospace.py +++ b/youtube_dl/extractor/yospace.py @@ -36,7 +36,7 @@ class YospaceIE(InfoExtractor): def _extract_formats(self, mas_url, video_id): formats = [] hls_url = None - jfpage = self._download_webpage(mas_url, video_id) + jfpage = self._download_webpage(mas_url, 'json') jf = self._parse_json(jfpage, video_id, transform_source=js_to_json) for ent in jf: if ent.get('type', '') == 'application/x-mpeg-url': @@ -117,7 +117,7 @@ class ReutersIE(YospaceIE): ret = [] rdata = {} - javascript_chunks = re.findall(r']+text/javascript[^>]*>(.*?)', webpage, re.DOTALL) + javascript_chunks = re.findall(r']*>(.*?)', webpage, re.DOTALL) if not javascript_chunks: return @@ -125,40 +125,51 @@ class ReutersIE(YospaceIE): s = m.group(1) if rdata.get(s): s = rdata.get(s) - return ': "' + s + '",\n' - return ': False,\n' + return ': ' + s + m.group(2) + '' + return ': False' + m.group(2) + '' + + def cleanjsonvars(str): # just str/int variables that won't break js_to_json + # restr=r'[\'"]([^\'"]+)[\'"]\s*:\s*(([\'"])(|.*?[^\\])\3|\d+|[a-zA-Z0-9\._]+)\s*[,\}\]]' + restr = r"""(?x) + [\'"]([^\'"]+)[\'"] # quoted key + \s*:\s* # key -> var + ( # var: str/int/bareword + ([\'"]) # str: startquote -> \3 + ( # + | # str: blank + .*?[^\\] # str: accounting for \'s + ) # + \3| # str: endquote + \d+| # int + [a-zA-Z0-9\._]+ # bareword + ) # + \s*[,\}\]] # end with , or } ] if nested + """ + m = re.findall(restr, str) + return '{' + '\n'.join(["'" + f[0] + "': " + f[1] + ',' + for f in m]) + '}' vidnum = 0 for innerhtml in javascript_chunks: - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{[^\}]+://.+?\})\);', innerhtml, re.DOTALL) + + js_vars = re.findall(r'^\s*(Reuters\.[a-zA-Z0-9\._]+)\s*=\s*([\'"](?:|.*?[^\\][\'"])|\d+);', innerhtml, re.M) + if js_vars: + for ent in js_vars: + if not ent[1]: + continue + rdata[ent[0]] = ent[1] + + drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL) if drawplayer_js: vidnum += 1 - drawplayer_js = re.sub(r'".+?"\s*:\s*[^\d"\'].+?,\n', '', drawplayer_js.group(1)) - vdata = self._parse_json(drawplayer_js, 'javascript chunk', transform_source=js_to_json) + js = cleanjsonvars(drawplayer_js.group(1)) + js = re.sub(r':\s*(Reuters\.[a-zA-Z_]+\.[a-zA-Z_]+)\s*([,\}])', msub, js) + vdata = self._parse_json(js, 'javascript chunk', transform_source=js_to_json) desc = re.search(r'var RTR_VideoBlurb\s*=\s*"(.+?)";', innerhtml, re.DOTALL) if desc: vdata['description'] = desc.group(1) vdata['vidnum'] = vidnum ret.append(vdata) - else: - if re.search(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\[\]]+?)\s*=\s*[\'\"\d].+?;\s*\n', innerhtml, re.M): - js_vars = re.findall(r'^\s*Reuters\.([^\s\[\]\.]+\.[^\s\[\]\.]+)\s*=\s*[\'"]?(.*?)[\'"]?;\s*\n', innerhtml, re.M) - for ent in js_vars: - if not ent[1]: - continue - if re.search(r'["\'].+?[\(\)\+]', ent[1]): - continue - rdata[ent[0]] = ent[1] - drawplayer_js = re.search(r'Reuters.yovideo.drawPlayer\((\{.+?\})\);', innerhtml, re.DOTALL) - if drawplayer_js: - vidnum += 1 - ds = drawplayer_js.group(1) - ds = re.sub(r':\s*Reuters\.([a-zA-Z_]+\.[a-zA-Z_]+)\s*,\s*\n', msub, ds) - # "stuff_with": "variables like "+this("breaks")+" js_to_json"; - ds = re.sub(r'[\'"].+?[\'"]\s*:\s*\(.+,\s*\n', '', ds) - vdata = self._parse_json(ds, 'parsed javascript chunk', transform_source=js_to_json) - vdata['vidnum'] = vidnum - ret.append(vdata) return ret def _real_extract(self, url):