Try to get the encoding from the Unicode BOM and strip the BOM from the downloaded webpage
This commit is contained in:
parent
4087164678
commit
7f4b4650b0
@ -8,7 +8,6 @@ from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
clean_html,
|
||||
ExtractorError,
|
||||
strip_bom_utf8,
|
||||
RegexNotFoundError,
|
||||
UnavailableVideoError,
|
||||
update_url_query,
|
||||
@ -59,8 +58,7 @@ class CBAIE(InfoExtractor):
|
||||
description = ''
|
||||
formats = []
|
||||
|
||||
posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint',
|
||||
'unable to query posts api-endpoint', transform_source=strip_bom_utf8)
|
||||
posts_result = self._download_json(api_posts_url, video_id, 'query posts api-endpoint', 'unable to query posts api-endpoint')
|
||||
try:
|
||||
title = clean_html(posts_result['title']['rendered'])
|
||||
description = clean_html(posts_result['content']['rendered'])
|
||||
@ -73,7 +71,7 @@ class CBAIE(InfoExtractor):
|
||||
api_media_url = update_url_query(api_media_url, {'c': self._API_KEY})
|
||||
|
||||
media_result = self._download_json(api_media_url, video_id, 'query media api-endpoint%s' % api_key_str,
|
||||
'unable to qeury media api-endpoint%s' % api_key_str, transform_source=strip_bom_utf8)
|
||||
'unable to qeury media api-endpoint%s' % api_key_str)
|
||||
for media in media_result:
|
||||
try:
|
||||
url = media['source_url']
|
||||
|
@ -57,6 +57,7 @@ from ..utils import (
|
||||
parse_m3u8_attributes,
|
||||
extract_attributes,
|
||||
parse_codecs,
|
||||
parse_strip_bom,
|
||||
)
|
||||
|
||||
|
||||
@ -438,6 +439,10 @@ class InfoExtractor(object):
|
||||
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
|
||||
content_type = urlh.headers.get('Content-Type', '')
|
||||
webpage_bytes = urlh.read()
|
||||
webpage_bytes, bom_enc = parse_strip_bom(webpage_bytes)
|
||||
if not encoding:
|
||||
encoding = bom_enc
|
||||
|
||||
if prefix is not None:
|
||||
webpage_bytes = prefix + webpage_bytes
|
||||
if not encoding:
|
||||
|
@ -2235,8 +2235,8 @@ def age_restricted(content_limit, age_limit):
|
||||
return age_limit < content_limit
|
||||
|
||||
|
||||
def is_html(first_bytes):
|
||||
""" Detect whether a file contains HTML by examining its first bytes. """
|
||||
def parse_strip_bom(data):
|
||||
""" try to find Unicode BOM and strip it. """
|
||||
|
||||
BOMS = [
|
||||
(b'\xef\xbb\xbf', 'utf-8'),
|
||||
@ -2246,12 +2246,20 @@ def is_html(first_bytes):
|
||||
(b'\xfe\xff', 'utf-16-be'),
|
||||
]
|
||||
for bom, enc in BOMS:
|
||||
if first_bytes.startswith(bom):
|
||||
s = first_bytes[len(bom):].decode(enc, 'replace')
|
||||
break
|
||||
if data.startswith(bom):
|
||||
return data[len(bom):], enc
|
||||
else:
|
||||
s = first_bytes.decode('utf-8', 'replace')
|
||||
return data, None
|
||||
|
||||
|
||||
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading Unicode BOM is stripped via parse_strip_bom() and selects the
    encoding used for decoding; without a BOM the bytes are decoded as UTF-8.
    Undecodable bytes are replaced instead of raising.

    Returns the re.match() result: a match object when the decoded text
    starts with optional whitespace followed by '<', otherwise None.
    """

    first_bytes, enc = parse_strip_bom(first_bytes)
    # parse_strip_bom() reports None as the encoding when no BOM was found
    if enc is None:
        enc = 'utf-8'

    s = first_bytes.decode(enc, 'replace')
    return re.match(r'^\s*<', s)
|
||||
|
||||
|
||||
@ -3121,11 +3129,3 @@ def decode_png(png_data):
|
||||
current_row.append(color)
|
||||
|
||||
return width, height, pixels
|
||||
|
||||
|
||||
def strip_bom_utf8(s):
    """ Return the text s without a leading U+FEFF byte order mark.

    Only a single mark at the very start is removed; text that does not
    begin with the mark is returned unchanged.
    """
    bom = u'\ufeff'
    return s[len(bom):] if s.startswith(bom) else s
|
||||
|
Loading…
x
Reference in New Issue
Block a user