Merge branch 'master' of https://github.com/rg3/youtube-dl into multipart_videos

This commit is contained in:
Mark Lee 2014-03-28 17:56:39 -07:00
commit 840e7cf4f3
19 changed files with 471 additions and 109 deletions

View File

@ -169,6 +169,7 @@ which means you can modify it, redistribute it or use it however you like.
## Verbosity / Simulation Options: ## Verbosity / Simulation Options:
-q, --quiet activates quiet mode -q, --quiet activates quiet mode
--no-warnings Ignore warnings
-s, --simulate do not download the video and do not write -s, --simulate do not download the video and do not write
anything to disk anything to disk
--skip-download do not download the video --skip-download do not download the video
@ -180,7 +181,9 @@ which means you can modify it, redistribute it or use it however you like.
--get-duration simulate, quiet but print video length --get-duration simulate, quiet but print video length
--get-filename simulate, quiet but print output filename --get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format --get-format simulate, quiet but print output format
-j, --dump-json simulate, quiet but print JSON information -j, --dump-json simulate, quiet but print JSON information.
See --output for a description of available
keys.
--newline output progress bar as new lines --newline output progress bar as new lines
--no-progress do not print progress bar --no-progress do not print progress bar
--console-title display progress in console titlebar --console-title display progress in console titlebar

View File

@ -143,5 +143,8 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
def test_ComedyCentralShows(self):
self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -396,7 +396,7 @@ def parseOpts(overrideArguments=None):
help='simulate, quiet but print output format', default=False) help='simulate, quiet but print output format', default=False)
verbosity.add_option('-j', '--dump-json', verbosity.add_option('-j', '--dump-json',
action='store_true', dest='dumpjson', action='store_true', dest='dumpjson',
help='simulate, quiet but print JSON information', default=False) help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
verbosity.add_option('--newline', verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress', verbosity.add_option('--no-progress',

View File

@ -177,6 +177,8 @@ from .normalboots import NormalbootsIE
from .novamov import NovaMovIE from .novamov import NovaMovIE
from .nowness import NownessIE from .nowness import NownessIE
from .nowvideo import NowVideoIE from .nowvideo import NowVideoIE
from .ntv import NTVIE
from .oe1 import OE1IE
from .ooyala import OoyalaIE from .ooyala import OoyalaIE
from .orf import ORFIE from .orf import ORFIE
from .parliamentliveuk import ParliamentLiveUKIE from .parliamentliveuk import ParliamentLiveUKIE
@ -256,6 +258,7 @@ from .udemy import (
UdemyCourseIE UdemyCourseIE
) )
from .unistra import UnistraIE from .unistra import UnistraIE
from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE from .vbox7 import Vbox7IE
from .veehd import VeeHDIE from .veehd import VeeHDIE

View File

@ -16,9 +16,10 @@ class AppleTrailersIE(InfoExtractor):
"url": "http://trailers.apple.com/trailers/wb/manofsteel/", "url": "http://trailers.apple.com/trailers/wb/manofsteel/",
"playlist": [ "playlist": [
{ {
"file": "manofsteel-trailer4.mov",
"md5": "d97a8e575432dbcb81b7c3acb741f8a8", "md5": "d97a8e575432dbcb81b7c3acb741f8a8",
"info_dict": { "info_dict": {
"id": "manofsteel-trailer4",
"ext": "mov",
"duration": 111, "duration": 111,
"title": "Trailer 4", "title": "Trailer 4",
"upload_date": "20130523", "upload_date": "20130523",
@ -26,9 +27,10 @@ class AppleTrailersIE(InfoExtractor):
}, },
}, },
{ {
"file": "manofsteel-trailer3.mov",
"md5": "b8017b7131b721fb4e8d6f49e1df908c", "md5": "b8017b7131b721fb4e8d6f49e1df908c",
"info_dict": { "info_dict": {
"id": "manofsteel-trailer3",
"ext": "mov",
"duration": 182, "duration": 182,
"title": "Trailer 3", "title": "Trailer 3",
"upload_date": "20130417", "upload_date": "20130417",
@ -36,9 +38,10 @@ class AppleTrailersIE(InfoExtractor):
}, },
}, },
{ {
"file": "manofsteel-trailer.mov",
"md5": "d0f1e1150989b9924679b441f3404d48", "md5": "d0f1e1150989b9924679b441f3404d48",
"info_dict": { "info_dict": {
"id": "manofsteel-trailer",
"ext": "mov",
"duration": 148, "duration": 148,
"title": "Trailer", "title": "Trailer",
"upload_date": "20121212", "upload_date": "20121212",
@ -46,15 +49,16 @@ class AppleTrailersIE(InfoExtractor):
}, },
}, },
{ {
"file": "manofsteel-teaser.mov",
"md5": "5fe08795b943eb2e757fa95cb6def1cb", "md5": "5fe08795b943eb2e757fa95cb6def1cb",
"info_dict": { "info_dict": {
"id": "manofsteel-teaser",
"ext": "mov",
"duration": 93, "duration": 93,
"title": "Teaser", "title": "Teaser",
"upload_date": "20120721", "upload_date": "20120721",
"uploader_id": "wb", "uploader_id": "wb",
}, },
} },
] ]
} }
@ -65,16 +69,16 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie') movie = mobj.group('movie')
uploader_id = mobj.group('company') uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s): def fix_html(s):
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed # The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/ # like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m): def _clean_json(m):
return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s) s = re.sub(self._JSON_RE, _clean_json, s)
s = u'<html>' + s + u'</html>' s = '<html>' + s + u'</html>'
return s return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html) doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
@ -82,7 +86,7 @@ class AppleTrailersIE(InfoExtractor):
for li in doc.findall('./div/ul/li'): for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick'] on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE, trailer_info_json = self._search_regex(self._JSON_RE,
on_click, u'trailer info') on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json) trailer_info = json.loads(trailer_info_json)
title = trailer_info['title'] title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@ -98,8 +102,7 @@ class AppleTrailersIE(InfoExtractor):
first_url = trailer_info['url'] first_url = trailer_info['url']
trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
settings = json.loads(settings_json)
formats = [] formats = []
for format in settings['metadata']['sizes']: for format in settings['metadata']['sizes']:
@ -107,7 +110,6 @@ class AppleTrailersIE(InfoExtractor):
format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
formats.append({ formats.append({
'url': format_url, 'url': format_url,
'ext': determine_ext(format_url),
'format': format['type'], 'format': format['type'],
'width': format['width'], 'width': format['width'],
'height': int(format['height']), 'height': int(format['height']),

View File

@ -28,7 +28,7 @@ class CanalplusIE(InfoExtractor):
video_id = mobj.groupdict().get('id') video_id = mobj.groupdict().get('id')
if video_id is None: if video_id is None:
webpage = self._download_webpage(url, mobj.group('path')) webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id info_url = self._VIDEO_INFO_TEMPLATE % video_id
doc = self._download_xml(info_url,video_id, doc = self._download_xml(info_url,video_id,
u'Downloading video info') u'Downloading video info')

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor):
_VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TEST = { _TEST = {
u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', 'md5': '4d7d549451bad625e0ff3d7bd56d776c',
u'info_dict': { 'info_dict': {
u'id': u'4629301', 'id': '4629301',
u'ext': u'mp4', 'ext': 'mp4',
u'title': u'Brick Briscoe', 'title': 'Brick Briscoe',
u'duration': 612, 'duration': 612,
'thumbnail': 're:^https?://.+\.jpg',
}, },
} }
@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
js_player = self._download_webpage( js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
video_id, u'Downlaoding player') video_id, 'Downlaoding player')
# it includes a required token # it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
pdoc = self._download_xml( pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info', video_id, 'Downloading video info',
transform_source=fix_xml_ampersands) transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track') track_doc = pdoc.find('trackList/track')

View File

@ -8,7 +8,7 @@ from ..utils import (
compat_str, compat_str,
compat_urllib_parse, compat_urllib_parse,
ExtractorError, ExtractorError,
int_or_none, float_or_none,
unified_strdate, unified_strdate,
) )
@ -46,7 +46,7 @@ class ComedyCentralShowsIE(InfoExtractor):
(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))| |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
(?P<interview> (?P<interview>
extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?))) extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
$''' $'''
_TEST = { _TEST = {
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
@ -134,7 +134,7 @@ class ComedyCentralShowsIE(InfoExtractor):
# a URL prefix; so extract the alternate reference # a URL prefix; so extract the alternate reference
# and then add the URL prefix manually. # and then add the URL prefix manually.
altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
if len(altMovieParams) == 0: if len(altMovieParams) == 0:
raise ExtractorError('unable to find Flash URL in webpage ' + url) raise ExtractorError('unable to find Flash URL in webpage ' + url)
else: else:
@ -159,7 +159,7 @@ class ComedyCentralShowsIE(InfoExtractor):
thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
duration = int_or_none(content.attrib.get('duration')) duration = float_or_none(content.attrib.get('duration'))
mediagen_url = content.attrib['url'] mediagen_url = content.attrib['url']
guid = itemEl.find('.//guid').text.rpartition(':')[-1] guid = itemEl.find('.//guid').text.rpartition(':')[-1]

View File

@ -1,23 +1,25 @@
from __future__ import unicode_literals
import re import re
from ..utils import ( from ..utils import (
compat_urllib_parse, compat_urllib_parse,
determine_ext
) )
from .common import InfoExtractor from .common import InfoExtractor
class EHowIE(InfoExtractor): class EHowIE(InfoExtractor):
IE_NAME = u'eHow' IE_NAME = 'eHow'
_VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
_TEST = { _TEST = {
u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
u'file': u'12245069.flv', 'md5': '9809b4e3f115ae2088440bcb4efbf371',
u'md5': u'9809b4e3f115ae2088440bcb4efbf371', 'info_dict': {
u'info_dict': { 'id': '12245069',
u"title": u"Hardwood Flooring Basics", 'ext': 'flv',
u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...", 'title': 'Hardwood Flooring Basics',
u"uploader": u"Erick Nathan" 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
'uploader': 'Erick Nathan',
} }
} }
@ -26,21 +28,16 @@ class EHowIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
webpage, u'video URL') webpage, 'video URL')
final_url = compat_urllib_parse.unquote(video_url) final_url = compat_urllib_parse.unquote(video_url)
uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />', uploader = self._html_search_meta('uploader', webpage)
webpage, u'uploader')
title = self._og_search_title(webpage).replace(' | eHow', '') title = self._og_search_title(webpage).replace(' | eHow', '')
ext = determine_ext(final_url)
return { return {
'_type': 'video',
'id': video_id, 'id': video_id,
'url': final_url, 'url': final_url,
'ext': ext,
'title': title, 'title': title,
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'uploader': uploader, 'uploader': uploader,
} }

View File

@ -25,6 +25,7 @@ from ..utils import (
from .brightcove import BrightcoveIE from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
from .smotri import SmotriIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -212,6 +213,21 @@ class GenericIE(InfoExtractor):
'skip_download': 'Requires rtmpdump' 'skip_download': 'Requires rtmpdump'
} }
}, },
# smotri embed
{
'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
'md5': 'ec40048448e9284c9a1de77bb188108b',
'info_dict': {
'id': 'v27008541fad',
'ext': 'mp4',
'title': 'Крым и Севастополь вошли в состав России',
'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
'duration': 900,
'upload_date': '20140318',
'uploader': 'rbctv_2012_4',
'uploader_id': 'rbctv_2012_4',
},
},
] ]
def report_download_webpage(self, video_id): def report_download_webpage(self, video_id):
@ -547,6 +563,11 @@ class GenericIE(InfoExtractor):
if mobj is not None: if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed') return self.url_result(mobj.group('url'), 'ArteTVEmbed')
# Look for embedded smotri.com player
smotri_url = SmotriIE._extract_url(webpage)
if smotri_url:
return self.url_result(smotri_url, 'Smotri')
# Start with something easy: JW Player in SWFObject # Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None: if mobj is None:

View File

@ -14,7 +14,7 @@ from ..utils import (
class MooshareIE(InfoExtractor): class MooshareIE(InfoExtractor):
IE_NAME = 'mooshare' IE_NAME = 'mooshare'
IE_DESC = 'Mooshare.biz' IE_DESC = 'Mooshare.biz'
_VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})' _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
_TESTS = [ _TESTS = [
{ {

View File

@ -1,12 +1,10 @@
# encoding: utf-8 # encoding: utf-8
from __future__ import unicode_literals
import re import re
import socket
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
@ -18,57 +16,54 @@ from ..utils import (
class NiconicoIE(InfoExtractor): class NiconicoIE(InfoExtractor):
IE_NAME = u'niconico' IE_NAME = 'niconico'
IE_DESC = u'ニコニコ動画' IE_DESC = 'ニコニコ動画'
_TEST = { _TEST = {
u'url': u'http://www.nicovideo.jp/watch/sm22312215', 'url': 'http://www.nicovideo.jp/watch/sm22312215',
u'file': u'sm22312215.mp4', 'md5': 'd1a75c0823e2f629128c43e1212760f9',
u'md5': u'd1a75c0823e2f629128c43e1212760f9', 'info_dict': {
u'info_dict': { 'id': 'sm22312215',
u'title': u'Big Buck Bunny', 'ext': 'mp4',
u'uploader': u'takuya0301', 'title': 'Big Buck Bunny',
u'uploader_id': u'2698420', 'uploader': 'takuya0301',
u'upload_date': u'20131123', 'uploader_id': '2698420',
u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'upload_date': '20131123',
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
}, },
u'params': { 'params': {
u'username': u'ydl.niconico@gmail.com', 'username': 'ydl.niconico@gmail.com',
u'password': u'youtube-dl', 'password': 'youtube-dl',
}, },
} }
_VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
_NETRC_MACHINE = 'niconico' _NETRC_MACHINE = 'niconico'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = True
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _login(self): def _login(self):
(username, password) = self._get_login_info() (username, password) = self._get_login_info()
# No authentication to be performed
if username is None: if username is None:
if self._LOGIN_REQUIRED: # Login is required
raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return False
# Log in # Log in
login_form_strs = { login_form_strs = {
u'mail': username, 'mail': username,
u'password': password, 'password': password,
} }
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode # chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
request = compat_urllib_request.Request( request = compat_urllib_request.Request(
u'https://secure.nicovideo.jp/secure/login', login_data) 'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage( login_results = self._download_webpage(
request, u'', note=u'Logging in', errnote=u'Unable to log in') request, None, note='Logging in', errnote='Unable to log in')
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username or password') self._downloader.report_warning('unable to log in: bad username or password')
return False return False
return True return True
@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor):
video_info = self._download_xml( video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note=u'Downloading video info page') note='Downloading video info page')
# Get flv info # Get flv info
flv_info_webpage = self._download_webpage( flv_info_webpage = self._download_webpage(
u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
video_id, u'Downloading flv info') video_id, 'Downloading flv info')
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information # Start extracting information
@ -106,10 +101,10 @@ class NiconicoIE(InfoExtractor):
url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
try: try:
user_info = self._download_xml( user_info = self._download_xml(
url, video_id, note=u'Downloading user information') url, video_id, note='Downloading user information')
video_uploader = user_info.find('.//nickname').text video_uploader = user_info.find('.//nickname').text
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: except ExtractorError as err:
self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
return { return {
'id': video_id, 'id': video_id,

158
youtube_dl/extractor/ntv.py Normal file
View File

@ -0,0 +1,158 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
RegexNotFoundError,
unescapeHTML
)
class NTVIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.ntv.ru/novosti/863142/',
'info_dict': {
'id': '746000',
'ext': 'flv',
'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
'duration': 136,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/video/novosti/750370/',
'info_dict': {
'id': '750370',
'ext': 'flv',
'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
'duration': 172,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
'info_dict': {
'id': '747480',
'ext': 'flv',
'title': '«Сегодня». 21 марта 2014 года. 16:00 ',
'description': '«Сегодня». 21 марта 2014 года. 16:00 ',
'duration': 1496,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/kino/Koma_film',
'info_dict': {
'id': '750783',
'ext': 'flv',
'title': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
'description': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
'duration': 28,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
'info_dict': {
'id': '751482',
'ext': 'flv',
'title': '«Дело врачей»: «Деревце жизни»',
'description': '«Дело врачей»: «Деревце жизни»',
'duration': 2590,
},
'params': {
# rtmp download
'skip_download': True,
},
},
]
_VIDEO_ID_REGEXES = [
r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
r'<video embed=[^>]+><id>(\d+)</id>',
r'<video restriction[^>]+><key>(\d+)</key>'
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
def extract(patterns, name, page, fatal=False):
for pattern in patterns:
mobj = re.search(pattern, page)
if mobj:
return mobj.group(1)
if fatal:
raise RegexNotFoundError(u'Unable to extract %s' % name)
return None
video_id = extract(self._VIDEO_ID_REGEXES, 'video id', page, fatal=True)
player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
title = unescapeHTML(player.find('./data/title').text)
description = unescapeHTML(player.find('./data/description').text)
video = player.find('./data/video')
video_id = video.find('./id').text
thumbnail = video.find('./splash').text
duration = int(video.find('./totaltime').text)
view_count = int(video.find('./views').text)
puid22 = video.find('./puid22').text
apps = {
'4': 'video1',
'7': 'video2',
}
app = apps[puid22] if puid22 in apps else apps['4']
formats = []
for format_id in ['', 'hi', 'webm']:
file = video.find('./%sfile' % format_id)
if file is None:
continue
size = video.find('./%ssize' % format_id)
formats.append({
'url': 'rtmp://media.ntv.ru/%s' % app,
'app': app,
'play_path': file.text,
'rtmp_conn': 'B:1',
'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
'page_url': 'http://www.ntv.ru',
'flash_ver': 'LNX 11,2,202,341',
'rtmp_live': True,
'ext': 'flv',
'filesize': int(size.text),
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@ -0,0 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
import calendar
import datetime
import re
from .common import InfoExtractor
# audios on oe1.orf.at are only available for 7 days, so we can't
# add tests.
class OE1IE(InfoExtractor):
IE_DESC = 'oe1.orf.at'
_VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
show_id = mobj.group('id')
data = self._download_json(
'http://oe1.orf.at/programm/%s/konsole' % show_id,
show_id
)
timestamp = datetime.datetime.strptime('%s %s' % (
data['item']['day_label'],
data['item']['time']
), '%d.%m.%Y %H:%M')
unix_timestamp = calendar.timegm(timestamp.utctimetuple())
return {
'id': show_id,
'title': data['item']['title'],
'url': data['item']['url_stream'],
'ext': 'mp3',
'description': data['item'].get('info'),
'timestamp': unix_timestamp
}

View File

@ -13,22 +13,24 @@ from ..utils import (
compat_urllib_request, compat_urllib_request,
ExtractorError, ExtractorError,
url_basename, url_basename,
int_or_none,
) )
class SmotriIE(InfoExtractor): class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com' IE_DESC = 'Smotri.com'
IE_NAME = 'smotri' IE_NAME = 'smotri'
_VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
_NETRC_MACHINE = 'smotri' _NETRC_MACHINE = 'smotri'
_TESTS = [ _TESTS = [
# real video id 2610366 # real video id 2610366
{ {
'url': 'http://smotri.com/video/view/?id=v261036632ab', 'url': 'http://smotri.com/video/view/?id=v261036632ab',
'file': 'v261036632ab.mp4',
'md5': '2a7b08249e6f5636557579c368040eb9', 'md5': '2a7b08249e6f5636557579c368040eb9',
'info_dict': { 'info_dict': {
'id': 'v261036632ab',
'ext': 'mp4',
'title': 'катастрофа с камер видеонаблюдения', 'title': 'катастрофа с камер видеонаблюдения',
'uploader': 'rbc2008', 'uploader': 'rbc2008',
'uploader_id': 'rbc08', 'uploader_id': 'rbc08',
@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor):
# real video id 57591 # real video id 57591
{ {
'url': 'http://smotri.com/video/view/?id=v57591cb20', 'url': 'http://smotri.com/video/view/?id=v57591cb20',
'file': 'v57591cb20.flv',
'md5': '830266dfc21f077eac5afd1883091bcd', 'md5': '830266dfc21f077eac5afd1883091bcd',
'info_dict': { 'info_dict': {
'id': 'v57591cb20',
'ext': 'flv',
'title': 'test', 'title': 'test',
'uploader': 'Support Photofile@photofile', 'uploader': 'Support Photofile@photofile',
'uploader_id': 'support-photofile', 'uploader_id': 'support-photofile',
@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor):
# video-password # video-password
{ {
'url': 'http://smotri.com/video/view/?id=v1390466a13c', 'url': 'http://smotri.com/video/view/?id=v1390466a13c',
'file': 'v1390466a13c.mp4',
'md5': 'f6331cef33cad65a0815ee482a54440b', 'md5': 'f6331cef33cad65a0815ee482a54440b',
'info_dict': { 'info_dict': {
'id': 'v1390466a13c',
'ext': 'mp4',
'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
'uploader': 'timoxa40', 'uploader': 'timoxa40',
'uploader_id': 'timoxa40', 'uploader_id': 'timoxa40',
@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor):
# age limit + video-password # age limit + video-password
{ {
'url': 'http://smotri.com/video/view/?id=v15408898bcf', 'url': 'http://smotri.com/video/view/?id=v15408898bcf',
'file': 'v15408898bcf.flv',
'md5': '91e909c9f0521adf5ee86fbe073aad70', 'md5': '91e909c9f0521adf5ee86fbe073aad70',
'info_dict': { 'info_dict': {
'id': 'v15408898bcf',
'ext': 'flv',
'title': 'этот ролик не покажут по ТВ', 'title': 'этот ролик не покажут по ТВ',
'uploader': 'zzxxx', 'uploader': 'zzxxx',
'uploader_id': 'ueggb', 'uploader_id': 'ueggb',
@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor):
'params': { 'params': {
'videopassword': '333' 'videopassword': '333'
} }
} },
# swf player
{
'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
'md5': '4d47034979d9390d14acdf59c4935bc2',
'info_dict': {
'id': 'v9188090500',
'ext': 'mp4',
'title': 'Shakira - Don\'t Bother',
'uploader': 'HannahL',
'uploader_id': 'lisaha95',
'upload_date': '20090331',
'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
},
},
] ]
_SUCCESS = 0 _SUCCESS = 0
@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor):
_PASSWORD_DETECTED = 2 _PASSWORD_DETECTED = 2
_VIDEO_NOT_FOUND = 3 _VIDEO_NOT_FOUND = 3
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
webpage)
if mobj is not None:
return mobj.group('url')
mobj = re.search(
r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
<div\s+class="video_image">[^<]+</div>\s*
<div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
if mobj is not None:
return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
def _search_meta(self, name, html, display_name=None): def _search_meta(self, name, html, display_name=None):
if display_name is None: if display_name is None:
display_name = name display_name = name
@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor):
# Video JSON does not provide enough meta data # Video JSON does not provide enough meta data
# We will extract some from the video web page instead # We will extract some from the video web page instead
video_page_url = 'http://' + mobj.group('url') video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page') video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
# Warning if video is unavailable # Warning if video is unavailable
@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor):
'upload_date': video_upload_date, 'upload_date': video_upload_date,
'uploader_id': video_uploader_id, 'uploader_id': video_uploader_id,
'duration': video_duration, 'duration': video_duration,
'view_count': video_view_count, 'view_count': int_or_none(video_view_count),
'age_limit': 18 if adult_content else 0, 'age_limit': 18 if adult_content else 0,
'video_page_url': video_page_url 'video_page_url': video_page_url
} }

View File

@ -18,12 +18,14 @@ class TEDIE(SubtitlesInfoExtractor):
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
| |
((?P<type_talk>talks)) # We have a simple talk ((?P<type_talk>talks)) # We have a simple talk
|
(?P<type_watch>watch)/[^/]+/[^/]+
) )
(/lang/(.*?))? # The url may contain the language (/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html" /(?P<name>[\w-]+) # Here goes the name and then ".html"
.*)$ .*)$
''' '''
_TEST = { _TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'md5': '4ea1dada91e4174b53dac2bb8ace429d', 'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': { 'info_dict': {
@ -36,7 +38,17 @@ class TEDIE(SubtitlesInfoExtractor):
'actively fooling us.'), 'actively fooling us.'),
'uploader': 'Dan Dennett', 'uploader': 'Dan Dennett',
} }
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
'md5': '226f4fb9c62380d11b7995efa4c87994',
'info_dict': {
'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
'ext': 'mp4',
'title': 'Vishal Sikka: The beauty and power of algorithms',
'thumbnail': 're:^https?://.+\.jpg',
'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
} }
}]
_FORMATS_PREFERENCE = { _FORMATS_PREFERENCE = {
'low': 1, 'low': 1,
@ -57,6 +69,8 @@ class TEDIE(SubtitlesInfoExtractor):
name = m.group('name') name = m.group('name')
if m.group('type_talk'): if m.group('type_talk'):
return self._talk_info(url, name) return self._talk_info(url, name)
elif m.group('type_watch'):
return self._watch_info(url, name)
else: else:
return self._playlist_videos_info(url, name) return self._playlist_videos_info(url, name)
@ -123,3 +137,26 @@ class TEDIE(SubtitlesInfoExtractor):
else: else:
self._downloader.report_warning(u'video doesn\'t have subtitles') self._downloader.report_warning(u'video doesn\'t have subtitles')
return {} return {}
def _watch_info(self, url, name):
    """Extract a single ted.com /watch/ video page."""
    page = self._download_webpage(url, name)

    # The player settings are embedded in the page as JSON inside a
    # data-config attribute; the video URL lives under config['video'].
    player_config = json.loads(self._html_search_regex(
        r"data-config='([^']+)", page, 'config'))

    return {
        'id': name,
        'url': player_config['video']['url'],
        'title': self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", page, 'title'),
        'thumbnail': player_config.get('image', {}).get('url'),
        'description': self._html_search_regex(
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            page, 'description', fatal=False),
    }

View File

@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
unified_strdate,
)
class UrortIE(InfoExtractor):
    """Extractor for band pages on NRK P3 Urørt (urort.p3.no).

    A band page is treated as a playlist; track metadata is fetched from
    the site's Breeze JSON API and each track becomes one mp3 entry.
    """
    IE_DESC = 'NRK P3 Urørt'
    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'

    _TEST = {
        'url': 'https://urort.p3.no/#!/Band/Gerilja',
        'md5': '5ed31a924be8a05e47812678a86e127b',
        'info_dict': {
            'id': '33124-4',
            'ext': 'mp3',
            'title': 'The Bomb',
            'thumbnail': 're:^https?://.+\.jpg',
            'like_count': int,
            'uploader': 'Gerilja',
            'uploader_id': 'Gerilja',
            'upload_date': '20100323',
        },
        'params': {
            'matchtitle': '^The Bomb$',  # To test, we want just one video
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')

        # OData-style filter: select all tracks belonging to this band.
        fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
        json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
        songs = self._download_json(json_url, playlist_id)
        # NOTE: removed a leftover debug print(songs[0]) that dumped raw
        # API data to stdout on every extraction.
        entries = [{
            # BandId/$id together uniquely identify a track in the API.
            'id': '%d-%s' % (s['BandId'], s['$id']),
            'title': s['Title'],
            'url': s['TrackUrl'],
            'ext': 'mp3',
            'uploader_id': playlist_id,
            'uploader': s.get('BandName', playlist_id),
            'like_count': s.get('LikeCount'),
            'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
            'upload_date': unified_strdate(s.get('Released')),
        } for s in songs]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_id,
            'entries': entries,
        }

View File

@ -1185,6 +1185,10 @@ def int_or_none(v, scale=1):
return v if v is None else (int(v) // scale) return v if v is None else (int(v) // scale)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, tolerating None.

    Returns ``default`` when *v* is None; otherwise
    ``float(v) * invscale / scale``.  The extra parameters default to
    the identity, so existing ``float_or_none(v, scale)`` callers keep
    their behaviour (mirrors the companion ``int_or_none`` helper).
    """
    return default if v is None else (float(v) * invscale / scale)
def parse_duration(s): def parse_duration(s):
if s is None: if s is None:
return None return None

View File

@ -1,2 +1,2 @@
__version__ = '2014.03.25.1' __version__ = '2014.03.28'