Merge remote-tracking branch 'upstream/master' into this-american-life

This commit is contained in:
Eric Wong 2015-06-15 14:36:59 -07:00
commit 12d9ea1481
8 changed files with 246 additions and 111 deletions

View File

@ -127,3 +127,4 @@ Julian Richen
Ping O. Ping O.
Mister Hat Mister Hat
Peter Ding Peter Ding
jackyzy823

View File

@ -1033,12 +1033,6 @@ class YoutubeDL(object):
info_dict['id'], info_dict.get('subtitles'), info_dict['id'], info_dict.get('subtitles'),
info_dict.get('automatic_captions')) info_dict.get('automatic_captions'))
# This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']:
if download:
self.process_info(info_dict)
return info_dict
# We now pick which formats have to be downloaded # We now pick which formats have to be downloaded
if info_dict.get('formats') is None: if info_dict.get('formats') is None:
# There's only one format available # There's only one format available

View File

@ -60,7 +60,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
continue continue
video_url_parsed = compat_urllib_parse_urlparse(video_url) video_url_parsed = compat_urllib_parse_urlparse(video_url)
f4m_url = self._download_webpage( f4m_url = self._download_webpage(
'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path, 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path,
video_id, 'Downloading f4m manifest token', fatal=False) video_id, 'Downloading f4m manifest token', fatal=False)
if f4m_url: if f4m_url:
formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id))

View File

@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE):
library/view/[^/]+| library/view/[^/]+|
api/v1/book api/v1/book
)/ )/
(?P<course_id>\d+)/ (?P<course_id>[^/]+)/
(?:chapter(?:-content)?/)? (?:chapter(?:-content)?/)?
(?P<part>part\d+)\.html (?P<part>part\d+)\.html
''' '''
@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE):
}, { }, {
'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
'only_matching': True, 'only_matching': True,
}, {
# non-digits in course id
'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -122,7 +126,7 @@ class SafariCourseIE(SafariBaseIE):
IE_NAME = 'safari:course' IE_NAME = 'safari:course'
IE_DESC = 'safaribooksonline.com online courses' IE_DESC = 'safaribooksonline.com online courses'
_VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)' _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',

View File

@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request, compat_urllib_request,
compat_urlparse,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
redirect_page, urlh = self._download_webpage_handle(url, video_id) # need to get the page 3 times for the correct jsSecretToken cookie
new_location = self._search_regex(r'window\.location = \'(.*)\';', # which is necessary for the correct title
redirect_page, 'redirect location') def get_session_id():
redirect_url = urlh.geturl() + new_location redirect_page = self._download_webpage(url, video_id)
webpage = self._download_webpage(redirect_url, video_id, session_id_url = self._search_regex(
r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
'session id url')
self._download_webpage(
compat_urlparse.urljoin(url, session_id_url), video_id,
'Getting session id')
get_session_id()
get_session_id()
webpage = self._download_webpage(url, video_id,
'Downloading redirect page') 'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>', title = self._html_search_regex(r'<title>(.*)</title>',

View File

@ -13,6 +13,7 @@ from ..compat import (
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
orderedSet, orderedSet,
str_to_int,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
) )
@ -34,6 +35,7 @@ class VKIE(InfoExtractor):
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'duration': 195, 'duration': 195,
'upload_date': '20120212', 'upload_date': '20120212',
'view_count': int,
}, },
}, },
{ {
@ -45,7 +47,8 @@ class VKIE(InfoExtractor):
'uploader': 'Tom Cruise', 'uploader': 'Tom Cruise',
'title': 'No name', 'title': 'No name',
'duration': 9, 'duration': 9,
'upload_date': '20130721' 'upload_date': '20130721',
'view_count': int,
} }
}, },
{ {
@ -59,6 +62,7 @@ class VKIE(InfoExtractor):
'title': 'Lin Dan', 'title': 'Lin Dan',
'duration': 101, 'duration': 101,
'upload_date': '20120730', 'upload_date': '20120730',
'view_count': int,
} }
}, },
{ {
@ -73,7 +77,8 @@ class VKIE(InfoExtractor):
'uploader': 'Триллеры', 'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352, 'duration': 8352,
'upload_date': '20121218' 'upload_date': '20121218',
'view_count': int,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, },
@ -100,6 +105,7 @@ class VKIE(InfoExtractor):
'title': 'Книга Илая', 'title': 'Книга Илая',
'duration': 6771, 'duration': 6771,
'upload_date': '20140626', 'upload_date': '20140626',
'view_count': int,
}, },
'skip': 'Only works from Russia', 'skip': 'Only works from Russia',
}, },
@ -175,25 +181,29 @@ class VKIE(InfoExtractor):
m_rutube.group(1).replace('\\', '')) m_rutube.group(1).replace('\\', ''))
return self.url_result(rutube_url) return self.url_result(rutube_url)
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts: if m_opts:
m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
if m_opts_url: if m_opts_url:
opts_url = m_opts_url.group(1) opts_url = m_opts_url.group(1)
if opts_url.startswith('//'): if opts_url.startswith('//'):
opts_url = 'http:' + opts_url opts_url = 'http:' + opts_url
return self.url_result(opts_url) return self.url_result(opts_url)
data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
data = json.loads(data_json) data = json.loads(data_json)
# Extract upload date # Extract upload date
upload_date = None upload_date = None
mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
if mobj is not None: if mobj is not None:
mobj.group(1) + ' ' + mobj.group(2) mobj.group(1) + ' ' + mobj.group(2)
upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
view_count = str_to_int(self._search_regex(
r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
info_page, 'view count', fatal=False))
formats = [{ formats = [{
'format_id': k, 'format_id': k,
'url': v, 'url': v,
@ -210,6 +220,7 @@ class VKIE(InfoExtractor):
'uploader': data.get('md_author'), 'uploader': data.get('md_author'),
'duration': data.get('duration'), 'duration': data.get('duration'),
'upload_date': upload_date, 'upload_date': upload_date,
'view_count': view_count,
} }

View File

@ -1,123 +1,237 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import math import base64
import random
import re
import time
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import ExtractorError
ExtractorError,
from ..compat import (
compat_urllib_parse,
compat_ord,
compat_urllib_request,
) )
class YoukuIE(InfoExtractor): class YoukuIE(InfoExtractor):
IE_NAME = 'youku'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
youku:) youku:)
(?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
''' '''
_TEST = {
'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html', _TESTS = [{
'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b', 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
'params': { 'md5': '5f3af4192eabacc4501508d54a8cabd7',
'test': False
},
'info_dict': { 'info_dict': {
'id': 'XNDgyMDQ2NTQw_part00', 'id': 'XMTc1ODE5Njcy_part1',
'ext': 'flv', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
'title': 'youtube-dl test video "\'/\\ä↭𝕐' 'ext': 'flv'
} }
} }, {
'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
'only_matching': True,
}, {
'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
'info_dict': {
'id': 'XODgxNjg1Mzk2',
'title': '武媚娘传奇 85',
},
'playlist_count': 11,
}, {
'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
'info_dict': {
'id': 'XMTI1OTczNDM5Mg',
'title': '花千骨 04',
},
'playlist_count': 13,
'skip': 'Available in China only',
}]
def _gen_sid(self): def construct_video_urls(self, data1, data2):
nowTime = int(time.time() * 1000) # get sid, token
random1 = random.randint(1000, 1998) def yk_t(s1, s2):
random2 = random.randint(1000, 9999) ls = list(range(256))
t = 0
for i in range(256):
t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
ls[i], ls[t] = ls[t], ls[i]
s = bytearray()
x, y = 0, 0
for i in range(len(s2)):
y = (y + 1) % 256
x = (x + ls[y]) % 256
ls[x], ls[y] = ls[y], ls[x]
s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
return bytes(s)
return "%d%d%d" % (nowTime, random1, random2) sid, token = yk_t(
b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
).decode('ascii').split('_')
def _get_file_ID_mix_string(self, seed): # get oip
mixed = [] oip = data2['ip']
source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
seed = float(seed)
for i in range(len(source)):
seed = (seed * 211 + 30031) % 65536
index = math.floor(seed / 65536 * len(source))
mixed.append(source[int(index)])
source.remove(source[int(index)])
# return ''.join(mixed)
return mixed
def _get_file_id(self, fileId, seed): # get fileid
mixed = self._get_file_ID_mix_string(seed) string_ls = list(
ids = fileId.split('*') 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
realId = [] shuffled_string_ls = []
for ch in ids: seed = data1['seed']
if ch: N = len(string_ls)
realId.append(mixed[int(ch)]) for ii in range(N):
return ''.join(realId) seed = (seed * 0xd3 + 0x754f) % 0x10000
idx = seed * len(string_ls) // 0x10000
shuffled_string_ls.append(string_ls[idx])
del string_ls[idx]
fileid_dict = {}
for format in data1['streamtypes']:
streamfileid = [
int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
fileid = ''.join(
[shuffled_string_ls[i] for i in streamfileid])
fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
def get_fileid(format, n):
fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
return fileid
# get ep
def generate_ep(format, n):
fileid = get_fileid(format, n)
ep_t = yk_t(
b'bf7e5f01',
('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
)
ep = base64.b64encode(ep_t).decode('ascii')
return ep
# generate video_urls
video_urls_dict = {}
for format in data1['streamtypes']:
video_urls = []
for dt in data1['segs'][format]:
n = str(int(dt['no']))
param = {
'K': dt['k'],
'hd': self.get_hd(format),
'myp': 0,
'ts': dt['seconds'],
'ypp': 0,
'ctype': 12,
'ev': 1,
'token': token,
'oip': oip,
'ep': generate_ep(format, n)
}
video_url = \
'http://k.youku.com/player/getFlvPath/' + \
'sid/' + sid + \
'_' + str(int(n) + 1).zfill(2) + \
'/st/' + self.parse_ext_l(format) + \
'/fileid/' + get_fileid(format, n) + '?' + \
compat_urllib_parse.urlencode(param)
video_urls.append(video_url)
video_urls_dict[format] = video_urls
return video_urls_dict
def get_hd(self, fm):
    """Return the 'hd' code string associated with stream type *fm*.

    *fm* is one of the stream type names listed below; any other value
    raises KeyError.
    """
    # The two tuples are positionally paired: stream type -> hd code.
    stream_types = ('flv', 'mp4', 'hd2', 'hd3', '3gp', '3gphd')
    hd_codes = ('0', '1', '2', '3', '0', '1')
    return dict(zip(stream_types, hd_codes))[fm]
def parse_ext_l(self, fm):
    """Return the container extension ('flv' or 'mp4') for stream type *fm*.

    Raises KeyError when *fm* is not a known stream type.
    """
    # Group the stream types by the extension they map to.
    ext_by_stream_type = dict.fromkeys(('flv', 'hd2', 'hd3', '3gp'), 'flv')
    ext_by_stream_type.update(dict.fromkeys(('mp4', '3gphd'), 'mp4'))
    return ext_by_stream_type[fm]
def get_format_name(self, fm):
    """Return the short format name ('h1' .. 'h6') for stream type *fm*.

    Raises KeyError when *fm* is not a known stream type.
    """
    # Position in this tuple (counted from 1) gives the numeric suffix:
    # 'hd3' -> 'h1', 'hd2' -> 'h2', ..., '3gp' -> 'h6'.
    ranked_stream_types = ('hd3', 'hd2', 'mp4', 'flv', '3gphd', '3gp')
    names = dict(
        (stream_type, 'h%d' % rank)
        for rank, stream_type in enumerate(ranked_stream_types, 1))
    return names[fm]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) video_id = self._match_id(url)
video_id = mobj.group('id')
info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id def retrieve_data(req_url, note):
req = compat_urllib_request.Request(req_url)
config = self._download_json(info_url, video_id) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
if cn_verification_proxy:
req.add_header('Ytdl-request-proxy', cn_verification_proxy)
error_code = config['data'][0].get('error_code') raw_data = self._download_json(req, video_id, note=note)
return raw_data['data'][0]
# request basic data
data1 = retrieve_data(
'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id,
'Downloading JSON metadata 1')
data2 = retrieve_data(
'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
'Downloading JSON metadata 2')
error_code = data1.get('error_code')
if error_code: if error_code:
# -8 means blocked outside China. error = data1.get('error')
error = config['data'][0].get('error') # Chinese and English, separated by newline. if error is not None and '因版权原因无法观看此视频' in error:
raise ExtractorError(error or 'Server reported error %i' % error_code, raise ExtractorError(
expected=True) 'Youku said: Sorry, this video is available in China only', expected=True)
video_title = config['data'][0]['title']
seed = config['data'][0]['seed']
format = self._downloader.params.get('format', None)
supported_format = list(config['data'][0]['streamfileids'].keys())
# TODO proper format selection
if format is None or format == 'best':
if 'hd2' in supported_format:
format = 'hd2'
else: else:
format = 'flv' msg = 'Youku server reported error %i' % error_code
ext = 'flv' if error is not None:
elif format == 'worst': msg += ': ' + error
format = 'mp4' raise ExtractorError(msg)
ext = 'mp4'
else:
format = 'flv'
ext = 'flv'
fileid = config['data'][0]['streamfileids'][format] title = data1['title']
keys = [s['k'] for s in config['data'][0]['segs'][format]]
# segs is usually a dictionary, but an empty *list* if an error occured.
files_info = [] # generate video_urls_dict
sid = self._gen_sid() video_urls_dict = self.construct_video_urls(data1, data2)
fileid = self._get_file_id(fileid, seed)
# column 8,9 of fileid represent the segment number # construct info
# fileid[7:9] should be changed entries = []
for index, key in enumerate(keys): for fm in data1['streamtypes']:
temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) video_urls = video_urls_dict[fm]
download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) for i in range(len(video_urls)):
if len(entries) < i + 1:
entries.append({'formats': []})
entries[i]['formats'].append({
'url': video_urls[i],
'format_id': self.get_format_name(fm),
'ext': self.parse_ext_l(fm),
'filesize': int(data1['segs'][fm][i]['size'])
})
info = { for i in range(len(entries)):
'id': '%s_part%02d' % (video_id, index), entries[i].update({
'url': download_url, 'id': '%s_part%d' % (video_id, i + 1),
'uploader': None, 'title': title,
'upload_date': None, })
'title': video_title,
'ext': ext,
}
files_info.append(info)
return files_info return {
'_type': 'multi_video',
'id': video_id,
'title': title,
'entries': entries,
}

View File

@ -1504,7 +1504,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
for pagenum in itertools.count(1): for pagenum in itertools.count(1):
url_query = { url_query = {
'search_query': query, 'search_query': query.encode('utf-8'),
'page': pagenum, 'page': pagenum,
'spf': 'navigate', 'spf': 'navigate',
} }