removed dependency on lxml; added IDParser
This commit is contained in:
parent d6a9615347
commit c6f45d4314
youtube-dl | 131
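A minimal usage sketch of the two helpers this commit introduces, get_element_by_id() and clean_html(). This is a hypothetical snippet, not part of the commit: it assumes both functions are in scope exactly as defined in the hunks below, and it uses the codebase's Python 2 idiom.

	# Hypothetical sketch (not from the commit): exercise the new stdlib-only
	# helpers on a YouTube-style description container.
	page = u'<p id="eow-description">first line<br/>&quot;second&quot; line</p>'

	# IDParser isolates the element whose id attribute matches.
	desc = get_element_by_id('eow-description', page)
	print desc              # -> first line<br/>&quot;second&quot; line

	# clean_html(): <br/> becomes a newline, remaining tags are stripped,
	# entities are decoded through htmlentity_transform.
	print clean_html(desc)
	# -> first line
	#    "second" line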
@@ -15,6 +15,7 @@ __authors__  = (
 	'Kevin Ngo',
 	'Ori Avtalion',
 	'shizeeg',
+	'Filippo Valsorda',
 	)
 
 __license__ = 'Public Domain'
@@ -66,11 +67,6 @@ try:
 except ImportError:
 	from cgi import parse_qs
 
-try:
-	import lxml.etree
-except ImportError:
-	pass # Handled below
-
 try:
 	import xml.etree.ElementTree
 except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
 				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 			return res
 
+
+class IDParser(HTMLParser.HTMLParser):
+	"""Modified HTMLParser that isolates a tag with the specified id"""
+	def __init__(self, id):
+		self.id = id
+		self.result = None
+		self.started = False
+		self.depth = {}
+		self.html = None
+		self.watch_startpos = False
+		HTMLParser.HTMLParser.__init__(self)
+
+	def loads(self, html):
+		self.html = html
+		self.feed(html)
+		self.close()
+
+	def handle_starttag(self, tag, attrs):
+		attrs = dict(attrs)
+		if self.started:
+			self.find_startpos(None)
+		if 'id' in attrs and attrs['id'] == self.id:
+			self.result = [tag]
+			self.started = True
+			self.watch_startpos = True
+		if self.started:
+			if not tag in self.depth: self.depth[tag] = 0
+			self.depth[tag] += 1
+
+	def handle_endtag(self, tag):
+		if self.started:
+			if tag in self.depth: self.depth[tag] -= 1
+			if self.depth[self.result[0]] == 0:
+				self.started = False
+				self.result.append(self.getpos())
+
+	def find_startpos(self, x):
+		"""Needed to put the start position of the result (self.result[1])
+		after the opening tag with the requested id"""
+		if self.watch_startpos:
+			self.watch_startpos = False
+			self.result.append(self.getpos())
+	handle_entityref = handle_charref = handle_data = handle_comment = \
+	handle_decl = handle_pi = unknown_decl = find_startpos
+
+	def get_result(self):
+		if self.result == None: return None
+		if len(self.result) != 3: return None
+		lines = self.html.split('\n')
+		lines = lines[self.result[1][0]-1:self.result[2][0]]
+		lines[0] = lines[0][self.result[1][1]:]
+		if len(lines) == 1:
+			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
+		lines[-1] = lines[-1][:self.result[2][1]]
+		return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+	"""Return the content of the tag with the specified id in the passed HTML document"""
+	parser = IDParser(id)
+	parser.loads(html)
+	return parser.get_result()
+
+
 def preferredencoding():
 	"""Get preferred encoding.
 
@@ -241,6 +300,18 @@ def htmlentity_transform(matchobj):
 	return (u'&%s;' % entity)
 
 
+def clean_html(html):
+	"""Clean an HTML snippet into a readable string"""
+	# Newline vs <br />
+	html = html.replace('\n', ' ')
+	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+	# Strip html tags
+	html = re.sub('<.*?>', '', html)
+	# Replace html entities
+	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+	return html
+
+
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
 	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -1419,18 +1490,9 @@ class YoutubeIE(InfoExtractor):
 					pass
 
 		# description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
-			if mobj is not None:
-				video_description = mobj.group(1).decode('utf-8')
-		else:
-			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-			# TODO use another parser
+		video_description = get_element_by_id("eow-description", video_webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# closed captions
 		video_subtitles = None
@@ -2164,18 +2226,9 @@ class VimeoIE(InfoExtractor):
 		video_thumbnail = config["video"]["thumbnail"]
 
 		# Extract video description
-		try:
-			lxml.etree
-		except NameError:
-			video_description = u'No description available.'
-			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
-			if mobj is not None:
-				video_description = mobj.group(1)
-		else:
-			html_parser = lxml.etree.HTMLParser()
-			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-			# TODO use another parser
+		video_description = get_element_by_id("description", webpage)
+		if video_description: video_description = clean_html(video_description.decode('utf8'))
+		else: video_description = ''
 
 		# Extract upload date
 		video_upload_date = u'NA'
@@ -3342,8 +3395,6 @@ class EscapistIE(InfoExtractor):
 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3359,11 +3410,11 @@ class EscapistIE(InfoExtractor):
 			return
 
 		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-		description = htmlParser.unescape(descMatch.group(1))
+		description = unescapeHTML(descMatch.group(1))
 		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-		imgUrl = htmlParser.unescape(imgMatch.group(1))
+		imgUrl = unescapeHTML(imgMatch.group(1))
 		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
+		playerUrl = unescapeHTML(playerUrlMatch.group(1))
 		configUrlMatch = re.search('config=(.*)$', playerUrl)
 		configUrl = urllib2.unquote(configUrlMatch.group(1))
 
@@ -3422,8 +3473,6 @@ class CollegeHumorIE(InfoExtractor):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3494,8 +3543,6 @@ class XVideosIE(InfoExtractor):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3584,8 +3631,6 @@ class SoundcloudIE(InfoExtractor):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3673,8 +3718,6 @@ class InfoQIE(InfoExtractor):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 
 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3908,8 +3951,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 			except UnavailableVideoError, err:
 				self._downloader.trouble(u'\nERROR: unable to download video')
 		elif mobj.group('course'): # A course page
-			unescapeHTML = HTMLParser.HTMLParser().unescape
-
 			course = mobj.group('course')
 			info = {
 				'id': _simplify_title(course),
@@ -3946,8 +3987,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 				assert entry['type'] == 'reference'
 				self.extract(entry['url'])
 		else: # Root page
-			unescapeHTML = HTMLParser.HTMLParser().unescape
-
 			info = {
 				'id': 'Stanford OpenClassroom',
 				'type': 'playlist',
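Design note, with a second hypothetical sketch under the same assumptions as the one above: IDParser needs only the stdlib HTMLParser, using getpos() for (line, column) bookkeeping and a per-tag depth counter, so a nested tag with the same name as the target does not terminate the match early.

	# Hypothetical sketch (not from the commit): a nested <div> inside the target.
	html = (u'<div id="outer">\n'
	        u'  before <div>inner</div> after\n'
	        u'</div>')
	# The inner <div> raises depth['div'] to 2; its </div> only drops it back
	# to 1, so the result ends at the outer </div>.
	print get_element_by_id('outer', html)
	# -> before <div>inner</div> after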