1. The highest quality is the first one available in ('ori', 'super', 'high', 'nor'). Their Chinese names are 原画 (original), 超清 (ultra high definition), 高清 (high definition), and 标清 (standard definition), respectively.
2. Sohu's server can't handle duplicate slashes in the URL, so the redundant slash has to be stripped when the download URL is built.
# encoding: utf-8

import json
import re

from .common import InfoExtractor
from ..utils import ExtractorError


class SohuIE(InfoExtractor):
    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'

    _TEST = {
        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
        u'file': u'382479172.mp4',
        u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7',
        u'info_dict': {
            u'title': u'MV:Far East Movement《The Illest》',
        },
    }

    def _real_extract(self, url):

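        # Helper: download and parse the JSON metadata for a given vid;
        # my.tv.sohu.com pages use a different metadata endpoint than tv.sohu.com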
        def _fetch_data(vid_id, mytv=False):
            if mytv:
                base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
            else:
                base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid='
            data_url = base_data_url + str(vid_id)
            data_json = self._download_webpage(
                data_url, video_id,
                note=u'Downloading JSON data for ' + str(vid_id))
            return json.loads(data_json)

        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        mytv = mobj.group('mytv') is not None

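        # The video title is the part of the page <title> before the first dash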
        webpage = self._download_webpage(url, video_id)
        raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
                                            webpage, u'video title')
        title = raw_title.partition('-')[0].strip()

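        # The page embeds a numeric vid in its JavaScript; use it to query the metadata API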
        vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage,
                                      u'video path')
        data = _fetch_data(vid, mytv)

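        # Quality identifiers used by Sohu, ordered from highest ('ori') to lowest ('nor')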
        QUALITIES = ('ori', 'super', 'high', 'nor')
        vid_ids = [data['data'][q + 'Vid']
                   for q in QUALITIES
                   if data['data'][q + 'Vid'] != 0]
        if not vid_ids:
            raise ExtractorError(u'No formats available for this video')

        # For now, we just pick the highest available quality
        vid_id = vid_ids[0]

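        # Reuse the metadata we already have if the chosen vid is the one from the
        # page; otherwise fetch the metadata for the selected quality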
        format_data = data if vid == vid_id else _fetch_data(vid_id, mytv)
        part_count = format_data['data']['totalBlocks']
        allot = format_data['allot']
        prot = format_data['prot']
        clipsURL = format_data['data']['clipsURL']
        su = format_data['data']['su']

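        # The video is served as `totalBlocks` separate parts; resolve the download
        # URL of each part and collect the parts as playlist entries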
        playlist = []
        for i in range(part_count):
            part_url = ('http://%s/?prot=%s&file=%s&new=%s' %
                        (allot, prot, clipsURL[i], su[i]))
            part_str = self._download_webpage(
                part_url, video_id,
                note=u'Downloading part %d of %d' % (i+1, part_count))

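            # Sohu's server can't handle duplicate slashes (note 2 above), so drop
            # the leading slash of su[i] when part_info[0] already ends with one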
            part_info = part_str.split('|')
            if part_info[0][-1] == '/' and su[i][0] == '/':
                video_url = '%s%s?key=%s' % (part_info[0], su[i][1:], part_info[3])
            else:
                video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])

            video_info = {
                'id': '%s_part%02d' % (video_id, i + 1),
                'title': title,
                'url': video_url,
                'ext': 'mp4',
            }
            playlist.append(video_info)

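        # Single-part videos are returned directly; multi-part ones as a playlist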
        if len(playlist) == 1:
            info = playlist[0]
            info['id'] = video_id
        else:
            info = {
                '_type': 'playlist',
                'entries': playlist,
                'id': video_id,
            }

        return info