Parse Tudou URLs such as http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html

2013-10-07 16:00:57 +08:00 · 2013-10-07 16:00:57 +08:00 · 01639fdcfa
commit 01639fdcfa
parent 387ae5f30b
1 changed files with 148 additions and 50 deletions
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@ -2,12 +2,17 @@
 import re
 import json
 import urllib2
 from time import time
 from random import randint
 from .common import InfoExtractor
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+	#_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/((?:listplay|programs)/(?:view|(.+?)))|(?:albumplay)/(?:([^/]+)|([^/]+))(?:\.html)?'
 	_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(((?:listplay|programs)/(?:view|(.+?)))|((?:albumplay)(?:/[^/]*)))/(?:([^/]+)|([^/]+))(?:\.html)?'
 	_TEST = {
 	u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
 	u'file': u'159448201.f4v',
@ -25,13 +30,106 @@ class TudouIE(InfoExtractor):
 		final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
 		return final_url
 	def get_page(self,url):
 		"""Fetch *url* and return its body decoded with the declared charset.
 		The charset is read from the ``charset=`` parameter of the response's
 		Content-Type header; bytes that cannot be decoded are replaced.
 		Returns the decoded text, or None when the response has no
 		Content-Type header or the header does not name a charset.
 		"""
 		response = urllib2.urlopen(url)
 		try:
 			raw = response.read()
 			content_type = response.headers.get('Content-Type')
 		finally:
 			# Always release the connection, even if reading fails.
 			response.close()
 		# headers.get() returns None for a missing header, which would make
 		# re.match() raise TypeError; an empty string simply fails to match.
 		m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type or '')
 		if m is None:
 			return None
 		return raw.decode(m.group(1), "replace")
 	def isyouku(self,url):
 		"""Scan the page at *url* for an embedded Youku video code and title.
 		Returns a ``(vcode, title)`` tuple. ``vcode`` is the value of the
 		page's ``vcode`` entry and ``title`` the value of its ``kw`` entry;
 		each is None when the corresponding entry is not found, so callers
 		can unpack the result and test ``vcode`` for truthiness.
 		"""
 		response = urllib2.urlopen(url)
 		try:
 			html = response.read()
 			content_type = response.headers.get('Content-Type')
 		finally:
 			# Always release the connection, even if reading fails.
 			response.close()
 		# A missing Content-Type header comes back as None; substitute an
 		# empty string so re.match() does not raise TypeError.
 		m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type or '')
 		if m:
 			# Decode only when a charset is declared; otherwise the raw
 			# body is searched as-is.
 			html = html.decode(m.group(1), "replace")
 		# re.search() returns None on a miss, so check each match before
 		# calling .group() instead of letting AttributeError escape.
 		vcode_match = re.search(r'vcode\s*[:=]\s*\'([^\']+)\'', html)
 		title_match = re.search(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', html)
 		vcode = vcode_match.group(1) if vcode_match else None
 		title = title_match.group(1) if title_match else None
 		return (vcode, title)
 	def downloadYouku_by_id(self,videoId,title):
 		"""Build one info dict per segment of the Youku video *videoId*.
 		Each dict holds the segment index as 'id', the segment URL, the
 		extension taken from the '/st/<ext>/' part of the first URL, the
 		given *title* and a None thumbnail. Returns the list of dicts.
 		"""
 		playlist = self.get_youkuinfo(videoId)
 		# find_video returns (url, size) pairs; only the URLs are used here.
 		segment_urls, _sizes = zip(*self.find_video(playlist, None))
 		# All segments share the extension found in the first URL.
 		ext_match = re.search(r'/st/([^/]+)/', segment_urls[0])
 		extension = str(ext_match.group(1))
 		return [
 			{
 				'id': index,
 				'url': segment_url,
 				'ext': extension,
 				'title': title,
 				'thumbnail': None,
 			}
 			for index, segment_url in enumerate(segment_urls)
 		]
 	def get_youkuinfo(self,videoId):
 		"""Fetch the Youku getPlayList response for *videoId* and parse it as JSON."""
 		playlist_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + videoId + '/timezone/+08/version/5/source/out/Sc/2'
 		page = self.get_page(playlist_url)
 		return json.loads(page)
 	def find_video(self,info, stream_type = None):
 		"""Build the segment download URLs described by a Youku playlist dict.
 		*info* is the parsed playlist; only ``info['data'][0]`` is read.
 		*stream_type* must be 'hd2', 'mp4' or 'flv'. When it is falsy, the
 		first of those (in that order) present in the playlist's segments is
 		used, and NotImplementedError is raised if none is present.
 		Returns a list of ``(url, size)`` tuples, one per segment.
 		"""
 		segs = info['data'][0]['segs']
 		available = segs.keys()
 		if not stream_type:
 			# Take the first stream type, in this fixed order, that has segments.
 			stream_type = next((name for name in ('hd2', 'mp4', 'flv') if name in available), None)
 			if stream_type is None:
 				raise NotImplementedError()
 		assert stream_type in ('hd2', 'mp4', 'flv')
 		container = {'hd2': 'flv', 'mp4': 'mp4', 'flv': 'flv'}[stream_type]
 		# Shuffle the alphabet with a generator driven by the playlist seed;
 		# the stream's file ids are indices into the shuffled string.
 		seed = info['data'][0]['seed']
 		remaining = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
 		picked = []
 		while remaining:
 			seed = (seed * 211 + 30031) & 0xFFFF
 			picked.append(remaining.pop(seed * len(remaining) >> 16))
 		mixed = ''.join(picked)
 		# The id list ends with '*', so the last split element is empty and dropped.
 		id_tokens = info['data'][0]['streamfileids'][stream_type].split('*')[:-1]
 		vid = ''.join(mixed[int(token)] for token in id_tokens)
 		# Session id: current time in milliseconds followed by two random numbers.
 		millis = int(time() * 1000)
 		sid = '%s%s%s' % (millis, randint(1000, 1999), randint(1000, 9999))
 		template = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s'
 		# Characters 8-9 of the file id are replaced by the segment number.
 		head, tail = vid[:8], vid[10:]
 		result = []
 		for seg in segs[stream_type]:
 			no = '%02x' % int(seg['no'])
 			link = template % (sid, no, container, head, no.upper(), tail, seg['k'], seg['seconds'])
 			result.append((link, int(seg['size'])))
 		return result
 	def _real_extract(self, url):
 		mobj = re.match(self._VALID_URL, url)
 		video_id = mobj.group(2)
 		if video_id is None:
 				vcode,title=self.isyouku(url)
 				if not vcode:
 					print "Not transferring to Youku"
 					return None
 				return self.downloadYouku_by_id(vcode,title)
 		webpage = self._download_webpage(url, video_id)
 		title = re.search(",kw:\"(.+)\"",webpage)
 		if title is None:
 			title = re.search(",kw: \'(.+)\'",webpage)
 		title = title.group(1)
 		thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
 		if thumbnail_url is None: