diff --git a/youtube_dl/extractor/cba.py b/youtube_dl/extractor/cba.py index c87171e2b..f291b2da4 100644 --- a/youtube_dl/extractor/cba.py +++ b/youtube_dl/extractor/cba.py @@ -8,7 +8,6 @@ from .common import InfoExtractor from ..utils import ( clean_html, ExtractorError, - strip_bom_utf8, RegexNotFoundError, UnavailableVideoError, update_url_query, @@ -59,8 +58,7 @@ class CBAIE(InfoExtractor): description = '' formats = [] - posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint', - 'unable to query posts api-endpoint', transform_source=strip_bom_utf8) + posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint', 'unable to query posts api-endpoint') try: title = clean_html(posts_result['title']['rendered']) description = clean_html(posts_result['content']['rendered']) @@ -73,7 +71,7 @@ class CBAIE(InfoExtractor): api_media_url = update_url_query(api_media_url, {'c': self._API_KEY}) media_result = self._download_json(api_media_url, video_id, 'query media api-endpoint%s' % api_key_str, - 'unable to qeury media api-endpoint%s' % api_key_str, transform_source=strip_bom_utf8) + 'unable to qeury media api-endpoint%s' % api_key_str) for media in media_result: try: url = media['source_url'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da0af29ec..f46857523 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -57,6 +57,7 @@ from ..utils import ( parse_m3u8_attributes, extract_attributes, parse_codecs, + parse_strip_bom, ) @@ -438,6 +439,10 @@ class InfoExtractor(object): def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() + webpage_bytes, bom_enc = parse_strip_bom(webpage_bytes) + if not encoding: + encoding = bom_enc + if prefix is not None: webpage_bytes = prefix + webpage_bytes if not encoding: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1d3d6600c..3dea36635 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2235,8 +2235,8 @@ def age_restricted(content_limit, age_limit): return age_limit < content_limit -def is_html(first_bytes): - """ Detect whether a file contains HTML by examining its first bytes. """ +def parse_strip_bom(data): + """ try to find Unicode BOM and strip it. """ BOMS = [ (b'\xef\xbb\xbf', 'utf-8'), @@ -2246,12 +2246,20 @@ def is_html(first_bytes): (b'\xfe\xff', 'utf-16-be'), ] for bom, enc in BOMS: - if first_bytes.startswith(bom): - s = first_bytes[len(bom):].decode(enc, 'replace') - break + if data.startswith(bom): + return data[len(bom):], enc else: - s = first_bytes.decode('utf-8', 'replace') + return data, None + +def is_html(first_bytes): + """ Detect whether a file contains HTML by examining its first bytes. """ + + first_bytes, enc = parse_strip_bom(first_bytes) + if enc == None: + enc = 'utf-8' + + s = first_bytes.decode(enc, 'replace') return re.match(r'^\s*<', s) @@ -3121,11 +3129,3 @@ def decode_png(png_data): current_row.append(color) return width, height, pixels - - -def strip_bom_utf8(s): - BOM_UTF8 = u'\ufeff' - if s.startswith(BOM_UTF8): - return s[len(BOM_UTF8):] - - return s