l1ving_youtube-dl/youtube_dl/extractor/tudou.py

# coding: utf-8

import re
import json
import urllib2

from time import time
from random import randint

from .common import InfoExtractor


class TudouIE(InfoExtractor):
    """Information extractor for tudou.com video pages.

    URLs under ``listplay``/``programs`` are resolved through the
    ``v2.tudou.com`` endpoint.  Any other URL accepted by ``_VALID_URL``
    (``albumplay``) is expected to embed a Youku ``vcode`` and is resolved
    through Youku's playlist endpoint instead.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(((?:listplay|programs)/(?:view|(.+?)))|((?:albumplay)(?:/[^/]*)))/(?:([^/]+)|([^/]+))(?:\.html)?'
    _TEST = {
        u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        u'file': u'159448201.f4v',
        u'md5': u'140a49ed444bd22f93330985d8475fcb',
        u'info_dict': {
            u"title": u"卡马乔国足开大脚长传冲吊集锦"
        }
    }

    # Page patterns, tried in order; each captures the wanted value in group 1.
    _TITLE_PATTERNS = (r',kw:"(.+)"', r",kw: '(.+)'")
    _THUMBNAIL_PATTERNS = (r",pic: '(.+?)'", r',pic:"(.+?)"')

    # Youku stream types in order of preference, and the file type that is
    # put into the download URL for each of them.
    _YOUKU_STREAM_PREFERENCE = ('hd2', 'mp4', 'flv')
    _YOUKU_FILE_TYPES = {'hd2': 'flv', 'mp4': 'mp4', 'flv': 'flv'}
    # Alphabet that gets shuffled (driven by the per-video seed) to decode
    # Youku file ids.
    _YOUKU_FILEID_ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890"

    def _url_for_id(self, id, quality = None):
        """Return the media URL for the Tudou part identified by ``id``.

        ``quality``, when given, is appended to the info request as the
        ``hd`` argument.  The URL is taken from the text preceding ``</f>``
        in the response.
        """
        info_url = "http://v2.tudou.com/f?id=" + str(id)
        if quality:
            info_url += '&hd' + quality
        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
        return self._html_search_regex('>(.+?)</f>', webpage, 'video url')

    @staticmethod
    def _first_group(patterns, text):
        """Return group 1 of the first pattern found in ``text``, else None."""
        for pattern in patterns:
            found = re.search(pattern, text)
            if found is not None:
                return found.group(1)
        return None

    @staticmethod
    def _best_quality(segments):
        """Return the highest quality key of the ``segments`` mapping.

        Keys made only of digits are compared as numbers, so that e.g. "10"
        ranks above "5"; any other key set keeps plain string ordering.
        """
        keys = list(segments)
        if all(key.isdigit() for key in keys):
            return max(keys, key=int)
        return sorted(keys)[-1]

    def get_page(self, url, video_id=None):
        """Download ``url`` and return its body as text.

        The request goes through ``_download_webpage`` like every other
        download in this extractor, so fetching and decoding are handled in
        one place and a response without a charset no longer yields None.
        ``video_id`` is only used to label the download; the URL itself is
        used when it is omitted.
        """
        display_id = url if video_id is None else video_id
        return self._download_webpage(url, display_id)

    def isyouku(self, url):
        """Look for an embedded Youku video on the page at ``url``.

        Returns a ``(vcode, title)`` tuple, or None when the page carries no
        ``vcode``.  When no ``kw`` title is present the vcode doubles as the
        title so that the video can still be named.
        """
        html = self.get_page(url)
        vcode_match = re.search(r'vcode\s*[:=]\s*\'([^\']+)\'', html)
        if vcode_match is None:
            return None
        vcode = vcode_match.group(1)
        title_match = re.search(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', html)
        title = title_match.group(1) if title_match is not None else vcode
        return (vcode, title)

    def downloadYouku_by_id(self, videoId, title):
        """Build one info dict per segment of the Youku video ``videoId``.

        Returns a list of dicts with the keys ``id``, ``url``, ``ext``,
        ``title`` and ``thumbnail`` (always None here); the list is empty
        when Youku reports no segments.
        """
        info = self.get_youkuinfo(videoId)
        segments = self.find_video(info, None)
        if not segments:
            self.to_screen(u'%s: no Youku segments found' % videoId)
            return []

        # find_video() puts the file type into every URL as /st/<type>/, so
        # reading it from the first segment is enough.
        ext = str(re.search(r'/st/([^/]+)/', segments[0][0]).group(1))
        result = []
        for index, (segment_url, _size) in enumerate(segments):
            result.append({
                # A string id that stays unique across videos, rather than
                # the bare segment index.
                'id': '%s_part%02d' % (videoId, index),
                'url': segment_url,
                'ext': ext,
                'title': title,
                'thumbnail': None,
            })
        return result

    def get_youkuinfo(self, videoId):
        """Fetch and JSON-decode Youku's playlist description of ``videoId``."""
        playlist_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + videoId + '/timezone/+08/version/5/source/out/Sc/2'
        return json.loads(self.get_page(playlist_url, videoId))

    @classmethod
    def _decode_youku_fileid(cls, seed, encoded_ids):
        """Decode a ``*``-separated list of alphabet indexes into a file id.

        The alphabet is shuffled by repeatedly advancing ``seed`` and moving
        the selected character to the output; each index in ``encoded_ids``
        then picks one character of the shuffled alphabet.
        """
        remaining = list(cls._YOUKU_FILEID_ALPHABET)
        shuffled = []
        while remaining:
            seed = (seed * 211 + 30031) & 0xFFFF
            shuffled.append(remaining.pop((seed * len(remaining)) >> 16))
        # The encoded string ends with '*', hence the dropped last element.
        indexes = encoded_ids.split('*')[:-1]
        return ''.join(shuffled[int(i)] for i in indexes)

    def find_video(self, info, stream_type = None):
        """Return ``(url, size)`` tuples for every segment of a Youku video.

        ``info`` is the decoded playlist returned by ``get_youkuinfo``.
        When ``stream_type`` is not given, the first available one of
        'hd2', 'mp4', 'flv' is used.

        Raises NotImplementedError when none of those types is available and
        ValueError when an unknown ``stream_type`` is requested.
        """
        data = info['data'][0]
        segs = data['segs']
        if not stream_type:
            for candidate in self._YOUKU_STREAM_PREFERENCE:
                if candidate in segs:
                    stream_type = candidate
                    break
            else:
                raise NotImplementedError(
                    'No supported Youku stream type among %r' % sorted(segs))
        if stream_type not in self._YOUKU_FILE_TYPES:
            raise ValueError('Unknown Youku stream type: %r' % (stream_type,))
        file_type = self._YOUKU_FILE_TYPES[stream_type]

        vid = self._decode_youku_fileid(
            data['seed'], data['streamfileids'][stream_type])
        sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))

        urls = []
        for seg in segs[stream_type]:
            no = '%02x' % int(seg['no'])
            # Characters 8-9 of the file id are replaced by the segment
            # number in upper-case hex.
            url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (
                sid, no, file_type, vid[:8], no.upper(), vid[10:], seg['k'], seg['seconds'])
            urls.append((url, int(seg['size'])))
        return urls

    def _real_extract(self, url):
        """Extract the info dicts (one per part) for the video at ``url``.

        Returns a list of info dicts, or None (after printing a notice) when
        nothing downloadable can be located on the page.
        """
        mobj = re.match(self._VALID_URL, url)
        # Group 2 only takes part in the match for listplay/programs URLs;
        # for everything else the page is treated as a Youku embed.
        if mobj.group(2) is None:
            youku = self.isyouku(url)
            if youku is None:
                self.to_screen(u'Not transferring to Youku')
                return None
            vcode, title = youku
            return self.downloadYouku_by_id(vcode, title)

        # Group 5 is the last path segment, i.e. the part of the URL that
        # names the video itself; it is only used to label the output.
        video_id = mobj.group(5)
        webpage = self._download_webpage(url, video_id)

        title = self._first_group(self._TITLE_PATTERNS, webpage)
        if title is None:
            # Hand the failure to the helper this extractor already uses for
            # mandatory fields instead of crashing on a missing match.
            title = self._search_regex(self._TITLE_PATTERNS[-1], webpage, u'title')
        # The thumbnail is optional: None when the page does not name one.
        thumbnail_url = self._first_group(self._THUMBNAIL_PATTERNS, webpage)

        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
        segments = json.loads(segs_json)
        if not segments:
            self.to_screen(u'%s: no segments found' % video_id)
            return None
        # It looks like the keys are the arguments that have to be passed as
        # the hd field in the request url, we pick the highest.
        quality = self._best_quality(segments)
        parts = segments[quality]
        len_parts = len(parts)
        if len_parts > 1:
            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))

        result = []
        for part in parts:
            part_id = part['k']
            final_url = self._url_for_id(part_id, quality)
            # Extension of the URL path, ignoring any query string.
            ext = (final_url.split('?')[0]).split('.')[-1]
            result.append({
                'id': part_id,
                'url': final_url,
                'ext': ext,
                'title': title,
                'thumbnail': thumbnail_url,
            })
        return result