Reverted commit 7f4b465 in favor of a much easier fix: using utf-8-sig for encoding

2016-09-02 01:36:46 +02:00 · 2016-09-02 01:36:46 +02:00 · a72b74c6f9
commit a72b74c6f9
parent 2ac405397e
3 changed files with 9 additions and 21 deletions
--- a/youtube_dl/extractor/cba.py
+++ b/youtube_dl/extractor/cba.py
@@ -50,7 +50,8 @@ class CBAIE(InfoExtractor):
        description = ''
        formats = []
-        posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint', 'unable to query posts api-endpoint')
+        posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint',
                                           'unable to query posts api-endpoint', encoding='utf-8-sig')
        try:
            title = posts_result['title']['raw']
        except KeyError:
@@ -71,7 +72,7 @@ class CBAIE(InfoExtractor):
            api_media_url = update_url_query(api_media_url, {'c': self._API_KEY})
        media_result = self._download_json(api_media_url, video_id, 'query media api-endpoint%s' % api_key_str,
-                                           'unable to qeury media api-endpoint%s' % api_key_str)
+                                           'unable to qeury media api-endpoint%s' % api_key_str, encoding='utf-8-sig')
        for media in media_result:
            try:
                url = media['source_url']
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -57,7 +57,6 @@ from ..utils import (
    parse_m3u8_attributes,
    extract_attributes,
    parse_codecs,
    parse_strip_bom,
 )
@@ -439,10 +438,6 @@ class InfoExtractor(object):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        webpage_bytes, bom_enc = parse_strip_bom(webpage_bytes)
        if not encoding:
            encoding = bom_enc
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2235,8 +2235,8 @@ def age_restricted(content_limit, age_limit):
    return age_limit < content_limit
-def parse_strip_bom(data):
+def is_html(first_bytes):
-    """ try to find Unicode BOM and strip it. """
+    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
@@ -2246,20 +2246,12 @@ def parse_strip_bom(data):
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
-        if data.startswith(bom):
+        if first_bytes.startswith(bom):
-            return data[len(bom):], enc
+            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
-        return data, None
+        s = first_bytes.decode('utf-8', 'replace')
 def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    first_bytes, enc = parse_strip_bom(first_bytes)
    if enc == None:
        enc = 'utf-8'
    s = first_bytes.decode(enc, 'replace')
    return re.match(r'^\s*<', s)