some HTMLParser bugfixes
This commit is contained in:
		
							parent
							
								
									9e6dd23876
								
							
						
					
					
						commit
						9beb5af82e
					
				
							
								
								
									
										
											BIN
										
									
								
								youtube-dl
									
									
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								youtube-dl
									
									
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								youtube-dl.exe
									
									
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								youtube-dl.exe
									
									
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -359,8 +359,8 @@ class YoutubeIE(InfoExtractor): | |||||||
| 					pass | 					pass | ||||||
| 
 | 
 | ||||||
| 		# description | 		# description | ||||||
| 		video_description = get_element_by_id("eow-description", video_webpage) | 		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) | ||||||
| 		if video_description: video_description = clean_html(video_description.decode('utf8')) | 		if video_description: video_description = clean_html(video_description) | ||||||
| 		else: video_description = '' | 		else: video_description = '' | ||||||
| 			 | 			 | ||||||
| 		# closed captions | 		# closed captions | ||||||
| @ -1055,8 +1055,8 @@ class VimeoIE(InfoExtractor): | |||||||
| 		video_thumbnail = config["video"]["thumbnail"] | 		video_thumbnail = config["video"]["thumbnail"] | ||||||
| 
 | 
 | ||||||
| 		# Extract video description | 		# Extract video description | ||||||
| 		video_description = get_element_by_id("description", webpage) | 		video_description = get_element_by_id("description", webpage.decode('utf8')) | ||||||
| 		if video_description: video_description = clean_html(video_description.decode('utf8')) | 		if video_description: video_description = clean_html(video_description) | ||||||
| 		else: video_description = '' | 		else: video_description = '' | ||||||
| 
 | 
 | ||||||
| 		# Extract upload date | 		# Extract upload date | ||||||
|  | |||||||
| @ -73,7 +73,7 @@ def htmlentity_transform(matchobj): | |||||||
| 	# Unknown entity in name, return its literal representation | 	# Unknown entity in name, return its literal representation | ||||||
| 	return (u'&%s;' % entity) | 	return (u'&%s;' % entity) | ||||||
| 
 | 
 | ||||||
| 
 | HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix | ||||||
| class IDParser(HTMLParser.HTMLParser): | class IDParser(HTMLParser.HTMLParser): | ||||||
| 	"""Modified HTMLParser that isolates a tag with the specified id""" | 	"""Modified HTMLParser that isolates a tag with the specified id""" | ||||||
| 	def __init__(self, id): | 	def __init__(self, id): | ||||||
| @ -83,8 +83,17 @@ class IDParser(HTMLParser.HTMLParser): | |||||||
| 		self.depth = {} | 		self.depth = {} | ||||||
| 		self.html = None | 		self.html = None | ||||||
| 		self.watch_startpos = False | 		self.watch_startpos = False | ||||||
|  | 		self.error_count = 0 | ||||||
| 		HTMLParser.HTMLParser.__init__(self) | 		HTMLParser.HTMLParser.__init__(self) | ||||||
| 
 | 
 | ||||||
|  | 	def error(self, message): | ||||||
|  | 		print self.getpos() | ||||||
|  | 		if self.error_count > 10 or self.started: | ||||||
|  | 			raise HTMLParser.HTMLParseError(message, self.getpos()) | ||||||
|  | 		self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line | ||||||
|  | 		self.error_count += 1 | ||||||
|  | 		self.goahead(1) | ||||||
|  | 
 | ||||||
| 	def loads(self, html): | 	def loads(self, html): | ||||||
| 		self.html = html | 		self.html = html | ||||||
| 		self.feed(html) | 		self.feed(html) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user