[extractor/common] Add the encoding parameter
The QQMusic info extractor needs a forced encoding to work correctly.
This commit is contained in:
		
							parent
							
								
									a685ae511a
								
							
						
					
					
						commit
						c9a779695d
					
				| @ -324,7 +324,7 @@ class InfoExtractor(object): | |||||||
|                 self._downloader.report_warning(errmsg) |                 self._downloader.report_warning(errmsg) | ||||||
|                 return False |                 return False | ||||||
| 
 | 
 | ||||||
|     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): |     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): | ||||||
|         """ Returns a tuple (page content as string, URL handle) """ |         """ Returns a tuple (page content as string, URL handle) """ | ||||||
|         # Strip hashes from the URL (#1038) |         # Strip hashes from the URL (#1038) | ||||||
|         if isinstance(url_or_request, (compat_str, str)): |         if isinstance(url_or_request, (compat_str, str)): | ||||||
| @ -334,14 +334,11 @@ class InfoExtractor(object): | |||||||
|         if urlh is False: |         if urlh is False: | ||||||
|             assert not fatal |             assert not fatal | ||||||
|             return False |             return False | ||||||
|         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) |         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) | ||||||
|         return (content, urlh) |         return (content, urlh) | ||||||
| 
 | 
 | ||||||
|     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): |     @staticmethod | ||||||
|         content_type = urlh.headers.get('Content-Type', '') |     def _guess_encoding_from_content(content_type, webpage_bytes): | ||||||
|         webpage_bytes = urlh.read() |  | ||||||
|         if prefix is not None: |  | ||||||
|             webpage_bytes = prefix + webpage_bytes |  | ||||||
|         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) |         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) | ||||||
|         if m: |         if m: | ||||||
|             encoding = m.group(1) |             encoding = m.group(1) | ||||||
| @ -354,6 +351,16 @@ class InfoExtractor(object): | |||||||
|                 encoding = 'utf-16' |                 encoding = 'utf-16' | ||||||
|             else: |             else: | ||||||
|                 encoding = 'utf-8' |                 encoding = 'utf-8' | ||||||
|  | 
 | ||||||
|  |         return encoding | ||||||
|  | 
 | ||||||
|  |     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): | ||||||
|  |         content_type = urlh.headers.get('Content-Type', '') | ||||||
|  |         webpage_bytes = urlh.read() | ||||||
|  |         if prefix is not None: | ||||||
|  |             webpage_bytes = prefix + webpage_bytes | ||||||
|  |         if not encoding: | ||||||
|  |             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) | ||||||
|         if self._downloader.params.get('dump_intermediate_pages', False): |         if self._downloader.params.get('dump_intermediate_pages', False): | ||||||
|             try: |             try: | ||||||
|                 url = url_or_request.get_full_url() |                 url = url_or_request.get_full_url() | ||||||
| @ -410,13 +417,13 @@ class InfoExtractor(object): | |||||||
| 
 | 
 | ||||||
|         return content |         return content | ||||||
| 
 | 
 | ||||||
|     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): |     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): | ||||||
|         """ Returns the data of the page as a string """ |         """ Returns the data of the page as a string """ | ||||||
|         success = False |         success = False | ||||||
|         try_count = 0 |         try_count = 0 | ||||||
|         while success is False: |         while success is False: | ||||||
|             try: |             try: | ||||||
|                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) |                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) | ||||||
|                 success = True |                 success = True | ||||||
|             except compat_http_client.IncompleteRead as e: |             except compat_http_client.IncompleteRead as e: | ||||||
|                 try_count += 1 |                 try_count += 1 | ||||||
| @ -431,10 +438,10 @@ class InfoExtractor(object): | |||||||
| 
 | 
 | ||||||
|     def _download_xml(self, url_or_request, video_id, |     def _download_xml(self, url_or_request, video_id, | ||||||
|                       note='Downloading XML', errnote='Unable to download XML', |                       note='Downloading XML', errnote='Unable to download XML', | ||||||
|                       transform_source=None, fatal=True): |                       transform_source=None, fatal=True, encoding=None): | ||||||
|         """Return the xml as an xml.etree.ElementTree.Element""" |         """Return the xml as an xml.etree.ElementTree.Element""" | ||||||
|         xml_string = self._download_webpage( |         xml_string = self._download_webpage( | ||||||
|             url_or_request, video_id, note, errnote, fatal=fatal) |             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) | ||||||
|         if xml_string is False: |         if xml_string is False: | ||||||
|             return xml_string |             return xml_string | ||||||
|         if transform_source: |         if transform_source: | ||||||
| @ -445,9 +452,10 @@ class InfoExtractor(object): | |||||||
|                        note='Downloading JSON metadata', |                        note='Downloading JSON metadata', | ||||||
|                        errnote='Unable to download JSON metadata', |                        errnote='Unable to download JSON metadata', | ||||||
|                        transform_source=None, |                        transform_source=None, | ||||||
|                        fatal=True): |                        fatal=True, encoding=None): | ||||||
|         json_string = self._download_webpage( |         json_string = self._download_webpage( | ||||||
|             url_or_request, video_id, note, errnote, fatal=fatal) |             url_or_request, video_id, note, errnote, fatal=fatal, | ||||||
|  |             encoding=encoding) | ||||||
|         if (not fatal) and json_string is False: |         if (not fatal) and json_string is False: | ||||||
|             return None |             return None | ||||||
|         return self._parse_json( |         return self._parse_json( | ||||||
|  | |||||||
| @ -24,7 +24,7 @@ class QQMusicIE(InfoExtractor): | |||||||
|             'title': '可惜没如果', |             'title': '可惜没如果', | ||||||
|             'upload_date': '20141227', |             'upload_date': '20141227', | ||||||
|             'creator': '林俊杰', |             'creator': '林俊杰', | ||||||
|             'description': 'md5:242c97c2847e0495583b7b13764f7106', |             'description': 'md5:4348ff1dd24036906baa7b6f973f8d30', | ||||||
|         } |         } | ||||||
|     }] |     }] | ||||||
| 
 | 
 | ||||||
| @ -41,7 +41,7 @@ class QQMusicIE(InfoExtractor): | |||||||
|         detail_info_page = self._download_webpage( |         detail_info_page = self._download_webpage( | ||||||
|             'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, |             'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, | ||||||
|             mid, note='Download song detail info', |             mid, note='Download song detail info', | ||||||
|             errnote='Unable to get song detail info') |             errnote='Unable to get song detail info', encoding='gbk') | ||||||
| 
 | 
 | ||||||
|         song_name = self._html_search_regex( |         song_name = self._html_search_regex( | ||||||
|             r"songname:\s*'([^']+)'", detail_info_page, 'song name') |             r"songname:\s*'([^']+)'", detail_info_page, 'song name') | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user