Merge pull request #1 from rg3/master

pull for local
chouzhenixiao 2015-08-12 23:38:33 +08:00
commit cf663e548f
33 changed files with 728 additions and 205 deletions

README.md

@@ -108,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like.
     --playlist-reverse               Download playlist videos in reverse order
     --xattr-set-filesize             Set file xattribute ytdl.filesize with expected filesize (experimental)
     --hls-prefer-native              Use the native HLS downloader instead of ffmpeg (experimental)
-    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,curl,httpie,wget
+    --external-downloader COMMAND    Use the specified external downloader. Currently supports aria2c,axel,curl,httpie,wget
     --external-downloader-args ARGS  Give these arguments to the external downloader

 ## Filesystem Options:

docs/supportedsites.md

@@ -86,7 +86,7 @@
  - **chirbit:profile**
  - **Cinchcast**
  - **Cinemassacre**
- - **clipfish**
+ - **Clipfish**
  - **cliphunter**
  - **Clipsyndicate**
  - **Cloudy**
@@ -116,6 +116,7 @@
  - **DailymotionCloud**
  - **daum.net**
  - **DBTV**
+ - **DCN**
  - **DctpTv**
  - **DeezerPlaylist**
  - **defense.gouv.fr**
@@ -351,7 +352,6 @@
  - **NowTV**
  - **nowvideo**: NowVideo
  - **npo**: npo.nl and ntr.nl
- - **npo**: npo.nl and ntr.nl
  - **npo.nl:live**
  - **npo.nl:radio**
  - **npo.nl:radio:fragment**
@@ -377,6 +377,7 @@
  - **parliamentlive.tv**: UK parliament videos
  - **Patreon**
  - **PBS**
+ - **Periscope**: Periscope
  - **PhilharmonieDeParis**: Philharmonie de Paris
  - **Phoenix**
  - **Photobucket**
@@ -406,6 +407,7 @@
  - **qqmusic:playlist**: QQ音乐 - 歌单
  - **qqmusic:singer**: QQ音乐 - 歌手
  - **qqmusic:toplist**: QQ音乐 - 排行榜
+ - **Quickscope**: Quick Scope
  - **QuickVid**
  - **R7**
  - **radio.de**
@@ -518,6 +520,7 @@
  - **ted**
  - **TeleBruxelles**
  - **telecinco.es**
+ - **Telegraaf**
  - **TeleMB**
  - **TeleTask**
  - **TenPlay**
@@ -621,6 +624,7 @@
  - **Vodlocker**
  - **VoiceRepublic**
  - **Vporn**
+ - **vpro**: npo.nl and ntr.nl
  - **VRT**
  - **vube**: Vube.com
  - **VuClip**

test/helper.py

@@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict):
         elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
             got = got_dict.get(info_field)
             self.assertTrue(
-                isinstance(got, list),
-                'Expected field %s to be a list, but it is of type %s' % (
+                isinstance(got, (list, dict)),
+                'Expected field %s to be a list or a dict, but it is of type %s' % (
                     info_field, type(got).__name__))
             expected_num = int(expected.partition(':')[2])
             assertGreaterEqual(
@@ -160,7 +160,7 @@ def expect_info_dict(self, got_dict, expected_dict):
     # Are checkable fields missing from the test case definition?
     test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
                           for key, value in got_dict.items()
-                          if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
+                          if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit'))
     missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
     if missing_keys:
         def _repr(v):

test/test_download.py

@@ -136,7 +136,9 @@ def generator(test_case):
             # We're not using .download here since that is just a shim
             # for outside error handling, and returns the exit code
             # instead of the result dict.
-            res_dict = ydl.extract_info(test_case['url'])
+            res_dict = ydl.extract_info(
+                test_case['url'],
+                force_generic_extractor=params.get('force_generic_extractor', False))
         except (DownloadError, ExtractorError) as err:
             # Check if the exception is not a network related one
             if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):

youtube_dl/YoutubeDL.py

@@ -28,7 +28,6 @@ if os.name == 'nt':
     import ctypes

 from .compat import (
-    compat_basestring,
     compat_cookiejar,
     compat_expanduser,
     compat_get_terminal_size,
@@ -40,7 +39,6 @@ from .compat import (
     compat_urllib_request,
 )
 from .utils import (
-    escape_url,
     ContentTooShortError,
     date_from_str,
     DateRange,
@@ -51,7 +49,6 @@ from .utils import (
     ExtractorError,
     format_bytes,
     formatSeconds,
-    HEADRequest,
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
@@ -1860,27 +1857,6 @@ class YoutubeDL(object):
     def urlopen(self, req):
         """ Start an HTTP download """
-        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
-        # always respected by websites, some tend to give out URLs with non percent-encoded
-        # non-ASCII characters (see telemb.py, ard.py [#3412])
-        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
-        # To work around aforementioned issue we will replace request's original URL with
-        # percent-encoded one
-        req_is_string = isinstance(req, compat_basestring)
-        url = req if req_is_string else req.get_full_url()
-        url_escaped = escape_url(url)
-
-        # Substitute URL if any change after escaping
-        if url != url_escaped:
-            if req_is_string:
-                req = url_escaped
-            else:
-                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
-                req = req_type(
-                    url_escaped, data=req.data, headers=req.headers,
-                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
-
         return self._opener.open(req, timeout=self._socket_timeout)

     def print_debug_header(self):

youtube_dl/downloader/external.py

@@ -45,11 +45,13 @@ class ExternalFD(FileDownloader):
     def supports(cls, info_dict):
         return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')

-    def _source_address(self, command_option):
-        source_address = self.params.get('source_address')
-        if source_address is None:
+    def _option(self, command_option, param):
+        param = self.params.get(param)
+        if param is None:
             return []
-        return [command_option, source_address]
+        if isinstance(param, bool):
+            return [command_option]
+        return [command_option, param]

     def _configuration_args(self, default=[]):
         ex_args = self.params.get('external_downloader_args')
@@ -77,7 +79,17 @@ class CurlFD(ExternalFD):
         cmd = [self.exe, '--location', '-o', tmpfilename]
         for key, val in info_dict['http_headers'].items():
             cmd += ['--header', '%s: %s' % (key, val)]
-        cmd += self._source_address('--interface')
+        cmd += self._option('--interface', 'source_address')
+        cmd += self._configuration_args()
+        cmd += ['--', info_dict['url']]
+        return cmd
+
+
+class AxelFD(ExternalFD):
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = [self.exe, '-o', tmpfilename]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['-H', '%s: %s' % (key, val)]
         cmd += self._configuration_args()
         cmd += ['--', info_dict['url']]
         return cmd
@@ -88,7 +100,9 @@ class WgetFD(ExternalFD):
         cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
         for key, val in info_dict['http_headers'].items():
             cmd += ['--header', '%s: %s' % (key, val)]
-        cmd += self._source_address('--bind-address')
+        cmd += self._option('--bind-address', 'source_address')
+        cmd += self._option('--proxy', 'proxy')
+        cmd += self._option('--no-check-certificate', 'nocheckcertificate')
         cmd += self._configuration_args()
         cmd += ['--', info_dict['url']]
         return cmd
@@ -105,7 +119,8 @@ class Aria2cFD(ExternalFD):
         cmd += ['--out', os.path.basename(tmpfilename)]
         for key, val in info_dict['http_headers'].items():
             cmd += ['--header', '%s: %s' % (key, val)]
-        cmd += self._source_address('--interface')
+        cmd += self._option('--interface', 'source_address')
+        cmd += self._option('--all-proxy', 'proxy')
         cmd += ['--', info_dict['url']]
         return cmd
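
A quick illustration of the new _option() contract, using a made-up params dict (not part of the commit): a missing parameter contributes nothing to the command line, a boolean contributes just the flag, and anything else contributes the flag plus its value.

# Standalone sketch mirroring ExternalFD._option(); `params` here is invented.
def option(params, command_option, param):
    value = params.get(param)
    if value is None:
        return []
    if isinstance(value, bool):
        return [command_option]
    return [command_option, value]

params = {'proxy': 'http://127.0.0.1:3128', 'nocheckcertificate': True}
print(option(params, '--proxy', 'proxy'))                              # ['--proxy', 'http://127.0.0.1:3128']
print(option(params, '--no-check-certificate', 'nocheckcertificate'))  # ['--no-check-certificate']
print(option(params, '--interface', 'source_address'))                 # []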

youtube_dl/extractor/__init__.py

@@ -118,6 +118,7 @@ from .dailymotion import (
 )
 from .daum import DaumIE
 from .dbtv import DBTVIE
+from .dcn import DCNIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .dfb import DFBIE
@@ -431,6 +432,10 @@ from .orf import (
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
+from .periscope import (
+    PeriscopeIE,
+    QuickscopeIE,
+)
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
@@ -591,6 +596,7 @@ from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .telebruxelles import TeleBruxellesIE
 from .telecinco import TelecincoIE
+from .telegraaf import TelegraafIE
 from .telemb import TeleMBIE
 from .teletask import TeleTaskIE
 from .tenplay import TenPlayIE

youtube_dl/extractor/breakcom.py

@@ -18,6 +18,7 @@ class BreakIE(InfoExtractor):
             'id': '2468056',
             'ext': 'mp4',
             'title': 'When Girls Act Like D-Bags',
+            'age_limit': 13,
         }
     }, {
         'url': 'http://www.break.com/video/ugc/baby-flex-2773063',

youtube_dl/extractor/clipfish.py

@@ -1,53 +1,68 @@
 from __future__ import unicode_literals

 import re
-import time
-import xml.etree.ElementTree

 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
-    parse_duration,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_end,
 )


 class ClipfishIE(InfoExtractor):
-    IE_NAME = 'clipfish'
-
-    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        'md5': '2521cd644e862936cf2e698206e47385',
+        'md5': '79bc922f3e8a9097b3d68a93780fd475',
         'info_dict': {
             'id': '3966754',
             'ext': 'mp4',
             'title': 'FIFA 14 - E3 2013 Trailer',
+            'timestamp': 1370938118,
+            'upload_date': '20130611',
             'duration': 82,
-        },
-        'skip': 'Blocked in the US'
+        }
     }

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
+        video_id = self._match_id(url)

-        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
-                    (video_id, int(time.time())))
-        doc = self._download_xml(
-            info_url, video_id, note='Downloading info page')
-        title = doc.find('title').text
-        video_url = doc.find('filename').text
-        if video_url is None:
-            xml_bytes = xml.etree.ElementTree.tostring(doc)
-            raise ExtractorError('Cannot find video URL in document %r' %
-                                 xml_bytes)
-        thumbnail = doc.find('imageurl').text
-        duration = parse_duration(doc.find('duration').text)
+        webpage = self._download_webpage(url, video_id)
+
+        video_info = self._parse_json(
+            js_to_json(self._html_search_regex(
+                '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')),
+            video_id)
+
+        formats = []
+        for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage):
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.append({
+                    'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
+                    'ext': 'mp4',
+                    'format_id': 'hls',
+                })
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': ext,
+                })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' - Video')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(video_info.get('length'))
+        timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))

         return {
             'id': video_id,
             'title': title,
-            'url': video_url,
+            'formats': formats,
             'thumbnail': thumbnail,
             'duration': duration,
+            'timestamp': timestamp,
         }
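
The rewrite scrapes the player JavaScript on the watch page instead of the old devxml API. A minimal standalone sketch of that idea; the HTML snippet is invented, and json.loads stands in for youtube-dl's js_to_json since the sample object is already valid JSON.

import json
import re

# Invented page fragment in the shape the extractor expects.
html = ('<script>var videoObject = {"length": "82"};'
        ' var videourl = "http://example.com/video.mp4";</script>')

video_info = json.loads(
    re.search(r'(?s)videoObject\s*=\s*({.+?});', html).group(1))
video_urls = re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', html)
print(video_info['length'], video_urls)  # 82 ['http://example.com/video.mp4']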

youtube_dl/extractor/common.py

@@ -18,6 +18,7 @@ from ..compat import (
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
+    compat_urllib_parse,
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
@@ -37,6 +38,9 @@ from ..utils import (
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
+    url_basename,
+    xpath_text,
+    xpath_with_ns,
 )
@@ -200,8 +204,8 @@ class InfoExtractor(object):
     There must be a key "entries", which is a list, an iterable, or a PagedList
     object, each element of which is a valid dictionary by this specification.

-    Additionally, playlists can have "title" and "id" attributes with the same
-    semantics as videos (see above).
+    Additionally, playlists can have "title", "description" and "id" attributes
+    with the same semantics as videos (see above).

     _type "multi_video" indicates that there are multiple videos that
@@ -636,7 +640,7 @@ class InfoExtractor(object):
     @staticmethod
     def _meta_regex(prop):
         return r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

     def _og_search_property(self, prop, html, name=None, **kargs):
@@ -978,69 +982,210 @@ class InfoExtractor(object):
         self._sort_formats(formats)
         return formats

-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
         if smil is False:
             assert not fatal
             return []

-        base = smil.find('./head/meta').get('base')
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break

         formats = []
         rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
+            src = video.get('src')
+            if not src:
+                continue
+
+            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            ext = video.get('ext')
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                continue
+
+            if src_url.startswith('http'):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue

         self._sort_formats(formats)

         return formats

-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
-        src = video.get('src')
-        if not src:
-            return [], rtmp_count
-        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-        width = int_or_none(video.get('width'))
-        height = int_or_none(video.get('height'))
-        proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
-        ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
-            streamer = video.get('streamer') or base
-            return ([{
-                'url': streamer,
-                'play_path': src,
-                'ext': 'flv',
-                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
+    def _parse_smil_subtitles(self, smil, namespace=None):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                if type_ == 'text/srt':
+                    ext = 'srt'
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(xspf, playlist_id)
+
+    def _parse_xspf(self, playlist, playlist_id):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = [{
+                'url': location.text,
+                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries

     def _live_title(self, name):
         """ Generate the title for a live video """

youtube_dl/extractor/dcn.py

@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DCNIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887',
+        'info_dict':
+        {
+            'id': '17375',
+            'ext': 'mp4',
+            'title': 'رحلة العمر : الحلقة 1',
+            'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 2041,
+            'timestamp': 1227504126,
+            'upload_date': '20081124',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        request = compat_urllib_request.Request(
+            'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+            headers={'Origin': 'http://www.dcndigital.ae'})
+
+        video = self._download_json(request, video_id)
+        title = video.get('title_en') or video['title_ar']
+
+        webpage = self._download_webpage(
+            'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?'
+            + compat_urllib_parse.urlencode({
+                'id': video['id'],
+                'user_id': video['user_id'],
+                'signature': video['signature'],
+                'countries': 'Q0M=',
+                'filter': 'DENY',
+            }), video_id)
+
+        m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+        rtsp_url = self._search_regex(
+            r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
+        if rtsp_url:
+            formats.append({
+                'url': rtsp_url,
+                'format_id': 'rtsp',
+            })
+
+        self._sort_formats(formats)
+
+        img = video.get('img')
+        thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None
+        duration = int_or_none(video.get('duration'))
+        description = video.get('description_en') or video.get('description_ar')
+        timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
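
For orientation, this is roughly how the embed URL above gets assembled; the id comes from the test case, while user_id and signature are placeholder values.

from urllib.parse import urlencode  # youtube-dl uses compat_urllib_parse.urlencode

query = {
    'id': '17375',          # from the test case above
    'user_id': '12345',     # placeholder
    'signature': 'abcdef',  # placeholder
    'countries': 'Q0M=',
    'filter': 'DENY',
}
print('http://admin.mangomolo.com/analytics/index.php/customers/embed/video?'
      + urlencode(query))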

youtube_dl/extractor/dhm.py

@@ -34,24 +34,14 @@ class DHMIE(InfoExtractor):
     }]

     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        playlist_id = self._match_id(url)

-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, playlist_id)

         playlist_url = self._search_regex(
             r"file\s*:\s*'([^']+)'", webpage, 'playlist url')

-        playlist = self._download_xml(playlist_url, video_id)
-
-        track = playlist.find(
-            './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track')
-
-        video_url = xpath_text(
-            track, './{http://xspf.org/ns/0/}location',
-            'video url', fatal=True)
-        thumbnail = xpath_text(
-            track, './{http://xspf.org/ns/0/}image',
-            'thumbnail')
+        entries = self._extract_xspf_playlist(playlist_url, playlist_id)

         title = self._search_regex(
             [r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
@@ -63,11 +53,10 @@ class DHMIE(InfoExtractor):
             r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
             webpage, 'duration', default=None))

-        return {
-            'id': video_id,
-            'url': video_url,
+        entries[0].update({
             'title': title,
             'description': description,
             'duration': duration,
-            'thumbnail': thumbnail,
-        }
+        })
+
+        return self.playlist_result(entries, playlist_id)

youtube_dl/extractor/fc2.py

@@ -86,7 +86,7 @@ class FC2IE(InfoExtractor):

         info_url = (
             "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
-            format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.', '%2E')))
+            format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))

         info_webpage = self._download_webpage(
             info_url, video_id, note='Downloading info page')

youtube_dl/extractor/fourtube.py

@@ -32,6 +32,7 @@ class FourTubeIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'categories': list,
+            'age_limit': 18,
         }
     }

youtube_dl/extractor/generic.py

@@ -130,6 +130,89 @@ class GenericIE(InfoExtractor):
                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
             }
         },
+        # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+        {
+            'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+            'info_dict': {
+                'id': 'smil',
+                'ext': 'mp4',
+                'title': 'Automatics, robotics and biocybernetics',
+                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                'formats': 'mincount:16',
+                'subtitles': 'mincount:1',
+            },
+            'params': {
+                'force_generic_extractor': True,
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+        {
+            'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+            'info_dict': {
+                'id': 'hds',
+                'ext': 'flv',
+                'title': 'hds',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from https://www.restudy.dk/video/play/id/1637
+        {
+            'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+            'info_dict': {
+                'id': 'video_1637',
+                'ext': 'flv',
+                'title': 'video_1637',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+        {
+            'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+            'info_dict': {
+                'id': 'smil-service',
+                'ext': 'flv',
+                'title': 'smil-service',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+        {
+            'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+            'info_dict': {
+                'id': '4719370',
+                'ext': 'mp4',
+                'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html
+        {
+            'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf',
+            'info_dict': {
+                'id': 'mZlp2ctYIUEB',
+                'ext': 'mp4',
+                'title': 'Tikibad ontruimd wegens brand',
+                'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 33,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # google redirect
         {
             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -236,6 +319,19 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Ooyala'],
         },
+        {
+            # ooyala video embedded with http://player.ooyala.com/iframe.js
+            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+            'info_dict': {
+                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+                'ext': 'mp4',
+                'title': '"Steve Jobs: Man in the Machine" trailer',
+                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # multiple ooyala embeds on SBN network websites
         {
             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@ -1110,11 +1206,15 @@ class GenericIE(InfoExtractor):
         self.report_extraction(video_id)

-        # Is it an RSS feed?
+        # Is it an RSS feed, a SMIL file or a XSPF playlist?
         try:
             doc = parse_xml(webpage)
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
+            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                return self._parse_smil(doc, url, video_id)
+            elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+                return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
         except compat_xml_parse_error:
             pass

@@ -1320,7 +1420,7 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))

         # Look for Ooyala videos
-        mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
@@ -1716,7 +1816,8 @@ class GenericIE(InfoExtractor):
             # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]

-            if determine_ext(video_url) == 'smil':
+            ext = determine_ext(video_url)
+            if ext == 'smil':
                 entries.append({
                     'id': video_id,
                     'formats': self._extract_smil_formats(video_url, video_id),
@@ -1724,6 +1825,8 @@ class GenericIE(InfoExtractor):
                     'title': video_title,
                     'age_limit': age_limit,
                 })
+            elif ext == 'xspf':
+                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
             else:
                 entries.append({
                     'id': video_id,

youtube_dl/extractor/iqiyi.py

@@ -201,7 +201,7 @@ class IqiyiIE(InfoExtractor):
         return raw_data

     def get_enc_key(self, swf_url, video_id):
-        enc_key = '8e29ab5666d041c3a1ea76e06dabdffb'
+        enc_key = '3601ba290e4f4662848c710e2122007e'  # last update at 2015-08-10 for Zombie
         return enc_key

     def _real_extract(self, url):

youtube_dl/extractor/nowtv.py

@@ -14,7 +14,7 @@ from ..utils import (

 class NowTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)'

     _TESTS = [{
         # rtl
@@ -127,6 +127,9 @@ class NowTVIE(InfoExtractor):
     }, {
         'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview',
         'only_matching': True,
+    }, {
+        'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+        'only_matching': True,
     }]

     def _real_extract(self, url):

youtube_dl/extractor/npo.py

@@ -407,6 +407,7 @@ class NPORadioFragmentIE(InfoExtractor):

 class VPROIE(NPOIE):
+    IE_NAME = 'vpro'
     _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'

     _TESTS = [

youtube_dl/extractor/odnoklassniki.py

@@ -16,15 +16,17 @@ class OdnoklassnikiIE(InfoExtractor):
     _TESTS = [{
         # metadata in JSON
         'url': 'http://ok.ru/video/20079905452',
-        'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
+        'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc',
         'info_dict': {
             'id': '20079905452',
             'ext': 'mp4',
             'title': 'Культура меняет нас (прекрасный ролик!))',
             'duration': 100,
+            'upload_date': '20141207',
             'uploader_id': '330537914540',
             'uploader': 'Виталий Добровольский',
             'like_count': int,
+            'age_limit': 0,
         },
     }, {
         # metadataUrl
@@ -35,9 +37,11 @@ class OdnoklassnikiIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Девушка без комплексов ...',
             'duration': 191,
+            'upload_date': '20150518',
             'uploader_id': '534380003155',
             'uploader': 'Андрей Мещанинов',
             'like_count': int,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',

youtube_dl/extractor/pbs.py

@@ -92,6 +92,7 @@ class PBSIE(InfoExtractor):
             'duration': 3172,
             'thumbnail': 're:^https?://.*\.jpg$',
             'upload_date': '20140122',
+            'age_limit': 10,
         },
         'params': {
             'skip_download': True,  # requires ffmpeg

youtube_dl/extractor/periscope.py

@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import parse_iso8601
+
+
+class PeriscopeIE(InfoExtractor):
+    IE_DESC = 'Periscope'
+    _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+        'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+        'info_dict': {
+            'id': '56102209',
+            'ext': 'mp4',
+            'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+            'timestamp': 1438978559,
+            'upload_date': '20150807',
+            'uploader': 'Bec Boop',
+            'uploader_id': '1465763',
+        },
+        'skip': 'Expires in 24 hours',
+    }
+
+    def _call_api(self, method, token):
+        return self._download_json(
+            'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token)
+
+    def _real_extract(self, url):
+        token = self._match_id(url)
+
+        broadcast_data = self._call_api('getBroadcastPublic', token)
+        broadcast = broadcast_data['broadcast']
+        status = broadcast['status']
+
+        uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
+        uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+
+        title = '%s - %s' % (uploader, status) if uploader else status
+        state = broadcast.get('state').lower()
+        if state == 'running':
+            title = self._live_title(title)
+        timestamp = parse_iso8601(broadcast.get('created_at'))
+
+        thumbnails = [{
+            'url': broadcast[image],
+        } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+        stream = self._call_api('getAccessPublic', token)
+
+        formats = []
+        for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+            video_url = stream.get(format_id + '_url')
+            if not video_url:
+                continue
+            f = {
+                'url': video_url,
+                'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+            }
+            if format_id != 'rtmp':
+                f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8'
+            formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': broadcast.get('id') or token,
+            'title': title,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+
+class QuickscopeIE(InfoExtractor):
+    IE_DESC = 'Quick Scope'
+    _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://watchonperiscope.com/broadcast/56180087',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        broadcast_id = self._match_id(url)
+        request = compat_urllib_request.Request(
+            'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({
+                'broadcast_id': broadcast_id,
+                'entry_ticket': '',
+                'from_push': 'false',
+                'uses_sessions': 'true',
+            }).encode('utf-8'))
+        return self.url_result(
+            self._download_json(request, broadcast_id)['share_url'], 'Periscope')

youtube_dl/extractor/porn91.py

@@ -22,6 +22,7 @@ class Porn91IE(InfoExtractor):
             'title': '18岁大一漂亮学妹水嫩性感再爽一次',
             'ext': 'mp4',
             'duration': 431,
+            'age_limit': 18,
         }
     }

@@ -68,4 +69,5 @@ class Porn91IE(InfoExtractor):
         'url': video_url,
         'duration': duration,
         'comment_count': comment_count,
+        'age_limit': self._rta_search(webpage),
     }

youtube_dl/extractor/rutube.py

@@ -30,6 +30,7 @@ class RutubeIE(InfoExtractor):
             'uploader': 'NTDRussian',
             'uploader_id': '29790',
             'upload_date': '20131016',
+            'age_limit': 0,
         },
         'params': {
             # It requires ffmpeg (m3u8 download)

youtube_dl/extractor/sexykarma.py

@@ -29,6 +29,7 @@ class SexyKarmaIE(InfoExtractor):
             'view_count': int,
             'comment_count': int,
             'categories': list,
+            'age_limit': 18,
         }
     }, {
         'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',

youtube_dl/extractor/southpark.py

@@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE):
             'title': 'The Government Won\'t Respect My Privacy',
             'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
         },
+    }, {
+        # non-ASCII characters in initial URL
+        'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+        'playlist_count': 4,
+    }, {
+        # non-ASCII characters in redirect URL
+        'url': 'http://www.southpark.de/alle-episoden/s18e09',
+        'playlist_count': 4,
     }]

youtube_dl/extractor/telegraaf.py

@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class TelegraafIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
+    _TEST = {
+        'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
+        'md5': '83245a9779bcc4a24454bfd53c65b6dc',
+        'info_dict': {
+            'id': '24353229',
+            'ext': 'mp4',
+            'title': 'Tikibad ontruimd wegens brand',
+            'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 33,
+        },
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        playlist_url = self._search_regex(
+            r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+
+        entries = self._extract_xspf_playlist(playlist_url, playlist_id)
+
+        title = remove_end(self._og_search_title(webpage), ' - VIDEO')
+        description = self._og_search_description(webpage)
+
+        return self.playlist_result(entries, playlist_id, title, description)

youtube_dl/extractor/tvplay.py

@@ -104,6 +104,7 @@ class TVPlayIE(InfoExtractor):
             'duration': 1492,
             'timestamp': 1330522854,
             'upload_date': '20120229',
+            'age_limit': 18,
         },
         'params': {
             # rtmp download

youtube_dl/extractor/tweakers.py

@@ -13,7 +13,7 @@ class TweakersIE(InfoExtractor):
     _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
     _TEST = {
         'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
-        'md5': '1b5afa817403bb5baa08359dca31e6df',
+        'md5': '3147e4ddad366f97476a93863e4557c8',
         'info_dict': {
             'id': '9926',
             'ext': 'mp4',
@@ -25,41 +25,7 @@ class TweakersIE(InfoExtractor):
     }

     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        playlist = self._download_xml(
-            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % video_id,
-            video_id)
-
-        NS_MAP = {
-            'xspf': 'http://xspf.org/ns/0/',
-            's1': 'http://static.streamone.nl/player/ns/0',
-        }
-
-        track = playlist.find(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP))
-
-        title = xpath_text(
-            track, xpath_with_ns('./xspf:title', NS_MAP), 'title')
-        description = xpath_text(
-            track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
-        thumbnail = xpath_text(
-            track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
-        duration = float_or_none(
-            xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'),
-            1000)
-
-        formats = [{
-            'url': location.text,
-            'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
-            'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
-            'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
-        } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
+        playlist_id = self._match_id(url)
+        entries = self._extract_xspf_playlist(
+            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
+        return self.playlist_result(entries, playlist_id)
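
For reference, the shape of the StreamOne XSPF document that _extract_xspf_playlist consumes; this toy playlist and the manual parse are illustrative only.

import xml.etree.ElementTree as etree

XSPF = '{http://xspf.org/ns/0/}'
doc = etree.fromstring(
    '<playlist xmlns="http://xspf.org/ns/0/"><trackList><track>'
    '<title>demo</title><location>http://example.com/v.mp4</location>'
    '<duration>33000</duration></track></trackList></playlist>')

for track in doc.findall('./%strackList/%strack' % (XSPF, XSPF)):
    title = track.find(XSPF + 'title').text
    location = track.find(XSPF + 'location').text
    duration = float(track.find(XSPF + 'duration').text) / 1000  # ms -> seconds
    print(title, location, duration)  # demo http://example.com/v.mp4 33.0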

youtube_dl/extractor/videolectures.py

@@ -12,7 +12,7 @@ from ..utils import (

 class VideoLecturesNetIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$'
     IE_NAME = 'videolectures.net'

     _TEST = {

youtube_dl/extractor/vimeo.py

@@ -29,6 +29,7 @@ from ..utils import (

 class VimeoBaseInfoExtractor(InfoExtractor):
     _NETRC_MACHINE = 'vimeo'
     _LOGIN_REQUIRED = False
+    _LOGIN_URL = 'https://vimeo.com/log_in'

     def _login(self):
         (username, password) = self._get_login_info()
@@ -37,21 +38,25 @@ class VimeoBaseInfoExtractor(InfoExtractor):
             raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
             return
         self.report_login()
-        login_url = 'https://vimeo.com/log_in'
-        webpage = self._download_webpage(login_url, None, False)
-        token = self._search_regex(r'xsrft":"(.*?)"', webpage, 'login token')
+        webpage = self._download_webpage(self._LOGIN_URL, None, False)
+        token = self._extract_xsrft(webpage)
         data = urlencode_postdata({
+            'action': 'login',
             'email': username,
             'password': password,
-            'action': 'login',
             'service': 'vimeo',
             'token': token,
         })
-        login_request = compat_urllib_request.Request(login_url, data)
+        login_request = compat_urllib_request.Request(self._LOGIN_URL, data)
         login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        login_request.add_header('Referer', self._LOGIN_URL)
         self._download_webpage(login_request, None, False, 'Wrong login info')

+    def _extract_xsrft(self, webpage):
+        return self._search_regex(
+            r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
+            webpage, 'login token', group='xsrft')
+

 class VimeoIE(VimeoBaseInfoExtractor):
     """Information extractor for vimeo.com."""
@@ -193,7 +198,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
             raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
-        token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
+        token = self._extract_xsrft(webpage)
         data = urlencode_postdata({
             'password': password,
             'token': token,
@@ -203,7 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         url = url.replace('http://', 'https://')
         password_request = compat_urllib_request.Request(url + '/password', data)
         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        password_request.add_header('Cookie', 'xsrft=%s' % token)
+        password_request.add_header('Referer', url)
         return self._download_webpage(
             password_request, video_id,
             'Verifying the password', 'Wrong password')
@@ -422,10 +427,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
     }


-class VimeoChannelIE(InfoExtractor):
+class VimeoChannelIE(VimeoBaseInfoExtractor):
     IE_NAME = 'vimeo:channel'
     _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
     _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+    _TITLE = None
     _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
     _TESTS = [{
         'url': 'https://vimeo.com/channels/tributes',
@@ -440,7 +446,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
         return '%s/videos/page:%d/' % (base_url, pagenum)

     def _extract_list_title(self, webpage):
-        return self._html_search_regex(self._TITLE_RE, webpage, 'list title')
+        return self._TITLE or self._html_search_regex(self._TITLE_RE, webpage, 'list title')

     def _login_list_password(self, page_url, list_id, webpage):
         login_form = self._search_regex(
@@ -453,7 +459,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
         if password is None:
             raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
         fields = self._hidden_inputs(login_form)
-        token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token')
+        token = self._extract_xsrft(webpage)
         fields['token'] = token
         fields['password'] = password
         post = urlencode_postdata(fields)
@@ -499,7 +505,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):

 class VimeoUserIE(VimeoChannelIE):
     IE_NAME = 'vimeo:user'
-    _VALID_URL = r'https://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
+    _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
     _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
     _TESTS = [{
         'url': 'https://vimeo.com/nkistudio/videos',
@@ -603,14 +609,14 @@ class VimeoReviewIE(InfoExtractor):
         return self.url_result(player_url, 'Vimeo', video_id)


-class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
+class VimeoWatchLaterIE(VimeoChannelIE):
     IE_NAME = 'vimeo:watchlater'
     IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
-    _VALID_URL = r'https://vimeo\.com/home/watchlater|:vimeowatchlater'
+    _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater'
+    _TITLE = 'Watch Later'
     _LOGIN_REQUIRED = True
-    _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
     _TESTS = [{
-        'url': 'https://vimeo.com/home/watchlater',
+        'url': 'https://vimeo.com/watchlater',
         'only_matching': True,
     }]
@@ -626,7 +632,7 @@ class VimeoWatchLaterIE(VimeoChannelIE):
         return request

     def _real_extract(self, url):
-        return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
+        return self._extract_videos('watchlater', 'https://vimeo.com/watchlater')


 class VimeoLikesIE(InfoExtractor):
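
The consolidated _extract_xsrft pattern accepts both the assignment and the object-literal spellings of the token, with either quote style. A quick check of the regex outside youtube-dl (both sample pages are invented):

import re

XSRFT_RE = r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)'

for page in ('window.vimeo.xsrft = "abc123";', "config = {xsrft: 'abc123'}"):
    print(re.search(XSRFT_RE, page).group('xsrft'))  # abc123 both times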

youtube_dl/extractor/youtube.py

@@ -213,7 +213,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                          |(?:                                             # or the v= param in all its forms
                              (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                              (?:\?|\#!?)                                  # the params delimiter ? or # or #!
-                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
+                             (?:.*?&)??                                   # any other preceding param (like /?s=tuff&v=xxxx)
                              v=
                          )
                      ))
@@ -365,6 +365,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:64249768eec3bc4276236606ea996373',
                 'uploader': 'justintimberlakeVEVO',
                 'uploader_id': 'justintimberlakeVEVO',
+                'age_limit': 18,
             }
         },
         {
@@ -380,6 +381,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_id': 'setindia'
             }
         },
+        {
+            'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
+            'note': 'Use the first video ID in the URL',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'phihag',
+                'upload_date': '20121002',
+                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
+                'like_count': int,
+                'dislike_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
             'note': '256k DASH audio (format 141) via DASH manifest',
@@ -421,7 +442,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'id': 'nfWlot6h_JM',
                 'ext': 'm4a',
                 'title': 'Taylor Swift - Shake It Off',
-                'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
+                'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
                 'uploader': 'TaylorSwiftVEVO',
                 'uploader_id': 'TaylorSwiftVEVO',
                 'upload_date': '20140818',
@@ -455,6 +476,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'The Witcher',
                 'uploader_id': 'WitcherGame',
                 'upload_date': '20140605',
+                'age_limit': 18,
             },
         },
         # Age-gate video with encrypted signature
@@ -468,6 +490,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'LloydVEVO',
                 'uploader_id': 'LloydVEVO',
                 'upload_date': '20110629',
+                'age_limit': 18,
             },
         },
         # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
@@ -492,7 +515,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'info_dict': {
                 'id': 'lqQg6PlCWgI',
                 'ext': 'mp4',
-                'upload_date': '20120731',
+                'upload_date': '20120724',
                 'uploader_id': 'olympic',
                 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                 'uploader': 'Olympics',
@@ -521,7 +544,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'url': 'qEJwOuvDf7I',
             'info_dict': {
                 'id': 'qEJwOuvDf7I',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
                 'description': '',
                 'upload_date': '20150404',
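
The change from (?:.*?&)? to (?:.*?&)?? makes the optional preceding-parameters group lazy, so the first v= parameter wins; that is what the new BaW_jenozKc test above asserts. A stripped-down demonstration (the miniature pattern stands in for the full _VALID_URL):

import re

url = 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY'

lazy = r'(?:\?|\#!?)(?:.*?&)??v=(?P<id>[0-9A-Za-z_-]{11})'
greedy = r'(?:\?|\#!?)(?:.*?&)?v=(?P<id>[0-9A-Za-z_-]{11})'

print(re.search(lazy, url).group('id'))    # BaW_jenozKc - the first ID
print(re.search(greedy, url).group('id'))  # UxxajLWwzqY - the optional group eats up to the second v=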

youtube_dl/utils.py

@@ -651,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
         return ret

     def http_request(self, req):
+        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
+        # always respected by websites, some tend to give out URLs with non percent-encoded
+        # non-ASCII characters (see telemb.py, ard.py [#3412])
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # To work around aforementioned issue we will replace request's original URL with
+        # percent-encoded one
+        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+        # the code of this workaround has been moved here from YoutubeDL.urlopen()
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+            new_req = req_type(
+                url_escaped, data=req.data, headers=req.headers,
+                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+            new_req.timeout = req.timeout
+            req = new_req
+
         for h, v in std_headers.items():
             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
             # The dict keys are capitalized because of this bug by urllib
@@ -695,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                 gz = io.BytesIO(self.deflate(resp.read()))
                 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                 resp.msg = old_resp.msg
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
+                if sys.version_info >= (3, 0):
+                    location = location.encode('iso-8859-1').decode('utf-8')
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    resp.headers['Location'] = location_escaped
         return resp

     https_request = http_request
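
Moving the workaround into the handler means redirect targets get the same treatment as the initial URL. A rough Python 3 approximation of the percent-encoding step; youtube-dl's escape_url is more thorough (it also handles query strings), so this sketch covers the path component only.

from urllib.parse import quote, urlsplit, urlunsplit

def escape_path(url):
    # Percent-encode non-ASCII characters in the path, leaving the rest alone;
    # '%' stays in `safe` so already-encoded sequences are not double-encoded.
    parts = urlsplit(url)
    return urlunsplit(parts._replace(path=quote(parts.path, safe="/%~!$&'()*+,;=:@")))

print(escape_path('http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen'))
# http://www.southpark.de/alle-episoden/s18e09-hashtag-aufw%C3%A4rmen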

youtube_dl/version.py

@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2015.07.28'
+__version__ = '2015.08.09'