Parse Tudou albumplay URLs such as http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html

2013-10-07 16:00:57 +08:00 · 2013-10-07 16:00:57 +08:00 · 01639fdcfa
commit 01639fdcfa
parent 387ae5f30b
1 changed files with 148 additions and 50 deletions
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@ -2,62 +2,160 @@
 import re
 import json
 import urllib2
 from time import time
 from random import randint
 from .common import InfoExtractor
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+	#_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/((?:listplay|programs)/(?:view|(.+?)))|(?:albumplay)/(?:([^/]+)|([^/]+))(?:\.html)?'
-    _TEST = {
+	_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(((?:listplay|programs)/(?:view|(.+?)))|((?:albumplay)(?:/[^/]*)))/(?:([^/]+)|([^/]+))(?:\.html)?'
-        u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
+	_TEST = {
-        u'file': u'159448201.f4v',
+	u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        u'md5': u'140a49ed444bd22f93330985d8475fcb',
+	u'file': u'159448201.f4v',
-        u'info_dict': {
+	u'md5': u'140a49ed444bd22f93330985d8475fcb',
-            u"title": u"卡马乔国足开大脚长传冲吊集锦"
+	u'info_dict': {
-        }
+		u"title": u"卡马乔国足开大脚长传冲吊集锦"
-    }
+		}
 	}
-    def _url_for_id(self, id, quality = None):
+	def _url_for_id(self, id, quality = None):
-        info_url = "http://v2.tudou.com/f?id="+str(id)
+		info_url = "http://v2.tudou.com/f?id="+str(id)
-        if quality:
+		if quality:
-            info_url += '&hd' + quality
+			info_url += '&hd' + quality
-        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
+		webpage = self._download_webpage(info_url, id, "Opening the info webpage")
-        final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
+		final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
-        return final_url
+		return final_url
-    def _real_extract(self, url):
+	def get_page(self,url):
-        mobj = re.match(self._VALID_URL, url)
+		request=urllib2.urlopen(url)
-        video_id = mobj.group(2)
+		html=request.read()
-        webpage = self._download_webpage(url, video_id)
+		content_type=request.headers.get('Content-Type')
-        title = re.search(",kw:\"(.+)\"",webpage)
+		m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
-        if title is None:
+		if m:
-            title = re.search(",kw: \'(.+)\'",webpage)
+			encoding=m.group(1)
-        title = title.group(1)
+			html=html.decode(encoding,"replace")
-        thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
+			return html
-        if thumbnail_url is None:
+		else:
-            thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
+			return None
        thumbnail_url = thumbnail_url.group(1)
        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
        segments = json.loads(segs_json)
        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, we pick the higher
        quality = sorted(segments.keys())[-1]
        parts = segments[quality]
        result = []
        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            ext = (final_url.split('?')[0]).split('.')[-1]
            part_info = {'id': part_id,
                          'url': final_url,
                          'ext': ext,
                          'title': title,
                          'thumbnail': thumbnail_url,
                          }
            result.append(part_info)
-        return result
+	def isyouku(self,url):
 		request=urllib2.urlopen(url)
 		html=request.read()
 		content_type=request.headers.get('Content-Type')
 		m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 		if m:
 			encoding=m.group(1)
 			html=html.decode(encoding,"replace")
 		vcode=re.search(r'vcode\s*[:=]\s*\'([^\']+)\'',html).group(1)
 		title=re.search(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]',html).group(1)
 		if vcode:
 			return (vcode,title)
 		else:
 			return None
 	def downloadYouku_by_id(self,videoId,title):
 		"""Build one info dict per segment of the Youku video *videoId*.
 		The playlist is fetched with get_youkuinfo() and resolved to
 		segment URLs with find_video(). Every entry shares *title* and
 		the extension taken from the ``/st/<type>/`` component of the
 		first URL; ``id`` is the segment's position in the list.
 		"""
 		playlist=self.get_youkuinfo(videoId)
 		# find_video returns (url, size) pairs; only the URLs are used below.
 		segment_urls,_sizes=zip(*self.find_video(playlist,None))
 		st_match=re.search(r'/st/([^/]+)/',segment_urls[0])
 		extension=str(st_match.group(1))
 		return [
 			{
 				'id':position,
 				'url':segment_url,
 				'ext':extension,
 				'title':title,
 				'thumbnail':None,
 			}
 			for position,segment_url in enumerate(segment_urls)
 		]
 	def get_youkuinfo(self,videoId):
 		"""Download the Youku playlist description of *videoId* and decode it from JSON."""
 		playlist_url='http://v.youku.com/player/getPlayList/VideoIDS/' + videoId + '/timezone/+08/version/5/source/out/Sc/2'
 		raw_playlist=self.get_page(playlist_url)
 		return json.loads(raw_playlist)
 	def find_video(self,info, stream_type = None):
 		"""Compute the download URL and size of every segment of a Youku video.
 		Only ``info['data'][0]`` is read (its ``segs``, ``seed`` and
 		``streamfileids`` entries). When *stream_type* is not given, the
 		first of 'hd2', 'mp4', 'flv' present in ``segs`` is used, and
 		NotImplementedError is raised if none of them is available.
 		Returns a list of ``(url, size)`` tuples in segment order.
 		"""
 		entry = info['data'][0]
 		segs = entry['segs']
 		available = segs.keys()
 		if not stream_type:
 			stream_type = next((t for t in ('hd2', 'mp4', 'flv') if t in available), None)
 			if stream_type is None:
 				raise NotImplementedError()
 		assert stream_type in ('hd2', 'mp4', 'flv')
 		container = {'hd2': 'flv', 'mp4': 'mp4', 'flv': 'flv'}[stream_type]
 		seed = entry['seed']
 		# Shuffle the alphabet: each step advances the seed and pops the character it selects.
 		alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
 		shuffled = []
 		while alphabet:
 			seed = (seed * 211 + 30031) & 0xFFFF
 			shuffled.append(alphabet.pop(seed * len(alphabet) >> 16))
 		mixed = ''.join(shuffled)
 		# streamfileids is a '*'-terminated list of indexes into the shuffled alphabet.
 		positions = entry['streamfileids'][stream_type].split('*')[:-1]
 		vid = ''.join(mixed[int(p)] for p in positions)
 		sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))
 		url_template = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s'
 		found = []
 		for seg in segs[stream_type]:
 			number = '%02x' % int(seg['no'])
 			# The segment number replaces characters 8-9 of the file id.
 			fields = (sid, number, container, vid[:8], number.upper(), vid[10:], seg['k'], seg['seconds'])
 			found.append((url_template % fields, int(seg['size'])))
 		return found
 	def _real_extract(self, url):
 		"""Extract the downloadable parts of the video at *url*.
 		Returns a list of info dicts, one per part. When group 2 of
 		_VALID_URL did not take part in the match, the page is handed to
 		the Youku code path instead; in that case None is returned if
 		isyouku() finds no vcode.
 		"""
 		mobj = re.match(self._VALID_URL, url)
 		video_id = mobj.group(2)
 		if video_id is None:
 			youku = self.isyouku(url)
 			# isyouku may return None rather than a pair; normalise before unpacking.
 			vcode, title = youku if youku else (None, None)
 			if not vcode:
 				# Report through the extractor's own output channel instead of a bare print.
 				self.to_screen(u'Not transferring to Youku')
 				return None
 			return self.downloadYouku_by_id(vcode, title)
 		webpage = self._download_webpage(url, video_id)
 		# Patterns are tried in order; a page without a title is reported by _search_regex
 		# rather than failing on a None match object.
 		title = self._search_regex([r',kw:"(.+)"', r",kw: '(.+)'"], webpage, u'title')
 		# The thumbnail is optional: a missing one yields None instead of aborting the extraction.
 		thumbnail_url = self._search_regex([r",pic: '(.+?)'", r',pic:"(.+?)"'], webpage, u'thumbnail', fatal=False)
 		segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
 		segments = json.loads(segs_json)
 		# It looks like the keys are the arguments that have to be passed as
 		# the hd field in the request url, we pick the higher
 		quality = sorted(segments.keys())[-1]
 		parts = segments[quality]
 		result = []
 		len_parts = len(parts)
 		if len_parts > 1:
 			self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
 		for part in parts:
 			part_id = part['k']
 			final_url = self._url_for_id(part_id, quality)
 			# The extension is whatever follows the last dot of the URL path (query string removed).
 			ext = (final_url.split('?')[0]).split('.')[-1]
 			part_info = {'id': part_id,
 						  'url': final_url,
 						  'ext': ext,
 						  'title': title,
 						  'thumbnail': thumbnail_url,
 						  }
 			result.append(part_info)
 		return result