Merge branch 'master' of https://github.com/rg3/youtube-dl into multipart_videos

This commit is contained in:
Mark Lee 2014-04-02 08:40:37 -07:00
commit 5d0495f7b4
30 changed files with 610 additions and 358 deletions

View File

@ -65,6 +65,7 @@ which means you can modify it, redistribute it or use it however you like.
configuration in ~/.config/youtube-dl.conf configuration in ~/.config/youtube-dl.conf
(%APPDATA%/youtube-dl/config.txt on (%APPDATA%/youtube-dl/config.txt on
Windows) Windows)
--encoding ENCODING Force the specified encoding (experimental)
## Video Selection: ## Video Selection:
--playlist-start NUMBER playlist video to start at (default is 1) --playlist-start NUMBER playlist video to start at (default is 1)

View File

@ -144,7 +144,15 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
def test_ComedyCentralShows(self): def test_ComedyCentralShows(self):
self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows']) self.assertMatch(
'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
['ComedyCentralShows'])
self.assertMatch(
'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
['ComedyCentralShows'])
self.assertMatch(
'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
['ComedyCentralShows'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -8,6 +8,7 @@ import datetime
import errno import errno
import io import io
import json import json
import locale
import os import os
import platform import platform
import re import re
@ -160,6 +161,7 @@ class YoutubeDL(object):
include_ads: Download ads as well include_ads: Download ads as well
default_search: Prepend this string if an input url is not valid. default_search: Prepend this string if an input url is not valid.
'auto' for elaborate guessing 'auto' for elaborate guessing
encoding: Use this encoding instead of the system-specified.
The following parameters are not used by YoutubeDL itself, they are used by The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader: the FileDownloader:
@ -1219,6 +1221,9 @@ class YoutubeDL(object):
def print_debug_header(self): def print_debug_header(self):
if not self.params.get('verbose'): if not self.params.get('verbose'):
return return
write_string('[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' %
(locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, self.get_encoding()))
write_string('[debug] youtube-dl version ' + __version__ + '\n') write_string('[debug] youtube-dl version ' + __version__ + '\n')
try: try:
sp = subprocess.Popen( sp = subprocess.Popen(
@ -1283,3 +1288,19 @@ class YoutubeDL(object):
# (See https://github.com/rg3/youtube-dl/issues/1309 for details) # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = [] opener.addheaders = []
self._opener = opener self._opener = opener
def encode(self, s):
    """Return ``s`` as a byte string.

    Byte strings are handed back untouched.  Anything else is encoded with
    the encoding reported by ``self.get_encoding()``.

    Raises:
        UnicodeEncodeError: when ``s`` cannot be represented in that
            encoding; the error's ``reason`` is extended with a hint about
            the ``--encoding`` option before it is re-raised.
    """
    if not isinstance(s, bytes):
        try:
            s = s.encode(self.get_encoding())
        except UnicodeEncodeError as err:
            # Point the user at the likely fix, then propagate the
            # original exception unchanged otherwise.
            err.reason += '. Check your system encoding configuration or use the --encoding option.'
            raise
    return s
def get_encoding(self):
    """Return the encoding to use for encoding text.

    The ``'encoding'`` entry of ``self.params`` wins when it is set;
    when it is missing or ``None`` the result of ``preferredencoding()``
    is used instead.
    """
    forced = self.params.get('encoding')
    return preferredencoding() if forced is None else forced

View File

@ -51,6 +51,7 @@ __authors__ = (
'David Wagner', 'David Wagner',
'Juan C. Olivares', 'Juan C. Olivares',
'Mattias Harrysson', 'Mattias Harrysson',
'phaer',
) )
__license__ = 'Public Domain' __license__ = 'Public Domain'
@ -256,13 +257,17 @@ def parseOpts(overrideArguments=None):
general.add_option( general.add_option(
'--bidi-workaround', dest='bidi_workaround', action='store_true', '--bidi-workaround', dest='bidi_workaround', action='store_true',
help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
general.add_option('--default-search', general.add_option(
dest='default_search', metavar='PREFIX', '--default-search',
help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') dest='default_search', metavar='PREFIX',
help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
general.add_option( general.add_option(
'--ignore-config', '--ignore-config',
action='store_true', action='store_true',
help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
general.add_option(
'--encoding', dest='encoding', metavar='ENCODING',
help='Force the specified encoding (experimental)')
selection.add_option( selection.add_option(
'--playlist-start', '--playlist-start',
@ -542,8 +547,6 @@ def parseOpts(overrideArguments=None):
write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' %
(locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))
return parser, opts, args return parser, opts, args
@ -677,7 +680,7 @@ def _real_main(argv=None):
date = DateRange.day(opts.date) date = DateRange.day(opts.date)
else: else:
date = DateRange(opts.dateafter, opts.datebefore) date = DateRange(opts.dateafter, opts.datebefore)
if opts.default_search not in ('auto', None) and ':' not in opts.default_search: if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search:
parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
# Do not download videos when there are audio-only formats # Do not download videos when there are audio-only formats
@ -789,6 +792,7 @@ def _real_main(argv=None):
'include_ads': opts.include_ads, 'include_ads': opts.include_ads,
'default_search': opts.default_search, 'default_search': opts.default_search,
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'encoding': opts.encoding,
} }
with YoutubeDL(ydl_opts) as ydl: with YoutubeDL(ydl_opts) as ydl:

View File

@ -156,6 +156,7 @@ from .mtv import (
MTVIE, MTVIE,
MTVIggyIE, MTVIggyIE,
) )
from .musicplayon import MusicPlayOnIE
from .muzu import MuzuTVIE from .muzu import MuzuTVIE
from .myspace import MySpaceIE from .myspace import MySpaceIE
from .myspass import MySpassIE from .myspass import MySpassIE
@ -285,7 +286,10 @@ from .vk import VKIE
from .vube import VubeIE from .vube import VubeIE
from .washingtonpost import WashingtonPostIE from .washingtonpost import WashingtonPostIE
from .wat import WatIE from .wat import WatIE
from .wdr import WDRIE from .wdr import (
WDRIE,
WDRMausIE,
)
from .weibo import WeiboIE from .weibo import WeiboIE
from .wimp import WimpIE from .wimp import WimpIE
from .wistia import WistiaIE from .wistia import WistiaIE

View File

@ -6,7 +6,6 @@ import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_urlparse, compat_urlparse,
determine_ext,
) )

View File

@ -1,22 +1,21 @@
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .ooyala import OoyalaIE
class BloombergIE(InfoExtractor): class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = { _TEST = {
u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', 'md5': '7bf08858ff7c203c870e8a6190e221e5',
u'info_dict': { 'info_dict': {
u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', 'id': 'qurhIVlJSB6hzkVi229d8g',
u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', 'ext': 'flv',
}, 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
u'params': { 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
# Requires ffmpeg (m3u8 manifest)
u'skip_download': True,
}, },
} }
@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
name = mobj.group('name') name = mobj.group('name')
webpage = self._download_webpage(url, name) webpage = self._download_webpage(url, name)
embed_code = self._search_regex( f4m_url = self._search_regex(
r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage, r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
'embed code') 'f4m url')
return OoyalaIE._build_url_result(embed_code) title = re.sub(': Video$', '', self._og_search_title(webpage))
return {
'id': name.split('-')[-1],
'title': title,
'url': f4m_url,
'ext': 'flv',
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor):
object_str = object_str.replace('<--', '<!--') object_str = object_str.replace('<--', '<!--')
object_str = fix_xml_ampersands(object_str) object_str = fix_xml_ampersands(object_str)
object_doc = xml.etree.ElementTree.fromstring(object_str) object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None: if fv_el is not None:

View File

@ -43,11 +43,13 @@ class ComedyCentralShowsIE(InfoExtractor):
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
(full-episodes/(?P<episode>.*)| (full-episodes/(?P<episode>.*)|
(?P<clip> (?P<clip>
(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) (?:videos/[^/]+/(?P<videotitle>[^/?#]+))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))| |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
)|
(?P<interview> (?P<interview>
extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?))) extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
$''' (?:[?#].*|$)'''
_TEST = { _TEST = {
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d', 'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
@ -57,7 +59,7 @@ class ComedyCentralShowsIE(InfoExtractor):
'upload_date': '20121213', 'upload_date': '20121213',
'description': 'Kristen Stewart learns to let loose in "On the Road."', 'description': 'Kristen Stewart learns to let loose in "On the Road."',
'uploader': 'thedailyshow', 'uploader': 'thedailyshow',
'title': 'thedailyshow-kristen-stewart part 1', 'title': 'thedailyshow kristen-stewart part 1',
} }
} }
@ -102,7 +104,9 @@ class ComedyCentralShowsIE(InfoExtractor):
assert mobj is not None assert mobj is not None
if mobj.group('clip'): if mobj.group('clip'):
if mobj.group('showname') == 'thedailyshow': if mobj.group('videotitle'):
epTitle = mobj.group('videotitle')
elif mobj.group('showname') == 'thedailyshow':
epTitle = mobj.group('tdstitle') epTitle = mobj.group('tdstitle')
else: else:
epTitle = mobj.group('cntitle') epTitle = mobj.group('cntitle')
@ -161,7 +165,7 @@ class ComedyCentralShowsIE(InfoExtractor):
content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
duration = float_or_none(content.attrib.get('duration')) duration = float_or_none(content.attrib.get('duration'))
mediagen_url = content.attrib['url'] mediagen_url = content.attrib['url']
guid = itemEl.find('.//guid').text.rpartition(':')[-1] guid = itemEl.find('./guid').text.rpartition(':')[-1]
cdoc = self._download_xml( cdoc = self._download_xml(
mediagen_url, epTitle, mediagen_url, epTitle,

View File

@ -10,9 +10,10 @@ class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' _VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
_TEST = { _TEST = {
'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', 'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
'file': '614784.mp4',
'md5': 'e12614f9ee303a6ccef415cb0793eba2', 'md5': 'e12614f9ee303a6ccef415cb0793eba2',
'info_dict': { 'info_dict': {
'id': '614784',
'ext': 'mp4',
'title': 'MythBusters: Mission Impossible Outtakes', 'title': 'MythBusters: Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being' 'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and ' ' each other -- to the point of confusing Jamie\'s dog -- and '
@ -34,7 +35,7 @@ class DiscoveryIE(InfoExtractor):
formats = [] formats = []
for f in info['mp4']: for f in info['mp4']:
formats.append( formats.append(
{'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])}) {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
return { return {
'id': info['contentId'], 'id': info['contentId'],

View File

@ -82,6 +82,17 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['Brightcove'], 'add_ie': ['Brightcove'],
}, },
{
'url': 'http://www.championat.com/video/football/v/87/87499.html',
'md5': 'fb973ecf6e4a78a67453647444222983',
'info_dict': {
'id': '3414141473001',
'ext': 'mp4',
'title': 'Видео. Удаление Дзагоева (ЦСКА)',
'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
'uploader': 'Championat',
},
},
# Direct link to a video # Direct link to a video
{ {
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@ -316,13 +327,16 @@ class GenericIE(InfoExtractor):
if not parsed_url.scheme: if not parsed_url.scheme:
default_search = self._downloader.params.get('default_search') default_search = self._downloader.params.get('default_search')
if default_search is None: if default_search is None:
default_search = 'auto' default_search = 'auto_warning'
if default_search == 'auto': if default_search in ('auto', 'auto_warning'):
if '/' in url: if '/' in url:
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url) return self.url_result('http://' + url)
else: else:
if default_search == 'auto_warning':
self._downloader.report_warning(
'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url)
return self.url_result('ytsearch:' + url) return self.url_result('ytsearch:' + url)
else: else:
assert ':' in default_search assert ':' in default_search

View File

@ -21,9 +21,10 @@ class HuffPostIE(InfoExtractor):
_TEST = { _TEST = {
'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
'file': '52dd3e4b02a7602131000677.mp4',
'md5': '55f5e8981c1c80a64706a44b74833de8', 'md5': '55f5e8981c1c80a64706a44b74833de8',
'info_dict': { 'info_dict': {
'id': '52dd3e4b02a7602131000677',
'ext': 'mp4',
'title': 'Legalese It! with @MikeSacksHP', 'title': 'Legalese It! with @MikeSacksHP',
'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ', 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
'duration': 1549, 'duration': 1549,

View File

@ -1,10 +1,8 @@
from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
determine_ext,
)
class IGNIE(InfoExtractor): class IGNIE(InfoExtractor):
@ -14,52 +12,57 @@ class IGNIE(InfoExtractor):
""" """
_VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)'
IE_NAME = u'ign.com' IE_NAME = 'ign.com'
_CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
_DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>', _DESCRIPTION_RE = [
r'id="my_show_video">.*?<p>(.*?)</p>', r'<span class="page-object-description">(.+?)</span>',
] r'id="my_show_video">.*?<p>(.*?)</p>',
]
_TESTS = [ _TESTS = [
{ {
u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
u'file': u'8f862beef863986b2785559b9e1aa599.mp4', 'md5': 'eac8bdc1890980122c3b66f14bdd02e9',
u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', 'info_dict': {
u'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599',
u'title': u'The Last of Us Review', 'ext': 'mp4',
u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', 'title': 'The Last of Us Review',
'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
} }
}, },
{ {
u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
u'playlist': [ 'playlist': [
{ {
u'file': u'5ebbd138523268b93c9141af17bec937.mp4', 'info_dict': {
u'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937',
u'title': u'GTA 5 Video Review', 'ext': 'mp4',
u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'title': 'GTA 5 Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
}, },
}, },
{ {
u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', 'info_dict': {
u'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2',
u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', 'ext': 'mp4',
u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
}, },
}, },
], ],
u'params': { 'params': {
u'skip_download': True, 'skip_download': True,
}, },
}, },
] ]
def _find_video_id(self, webpage): def _find_video_id(self, webpage):
res_id = [r'data-video-id="(.+?)"', res_id = [
r'<object id="vid_(.+?)"', r'data-video-id="(.+?)"',
r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', r'<object id="vid_(.+?)"',
] r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
]
return self._search_regex(res_id, webpage, 'video id') return self._search_regex(res_id, webpage, 'video id')
def _real_extract(self, url): def _real_extract(self, url):
@ -68,7 +71,7 @@ class IGNIE(InfoExtractor):
page_type = mobj.group('type') page_type = mobj.group('type')
webpage = self._download_webpage(url, name_or_id) webpage = self._download_webpage(url, name_or_id)
if page_type == 'articles': if page_type == 'articles':
video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url') video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, 'video url')
return self.url_result(video_url, ie='IGN') return self.url_result(video_url, ie='IGN')
elif page_type != 'video': elif page_type != 'video':
multiple_urls = re.findall( multiple_urls = re.findall(
@ -80,41 +83,37 @@ class IGNIE(InfoExtractor):
video_id = self._find_video_id(webpage) video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id) result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE, description = self._html_search_regex(self._DESCRIPTION_RE,
webpage, 'video description', webpage, 'video description', flags=re.DOTALL)
flags=re.DOTALL)
result['description'] = description result['description'] = description
return result return result
def _get_video_info(self, video_id): def _get_video_info(self, video_id):
config_url = self._CONFIG_URL_TEMPLATE % video_id config_url = self._CONFIG_URL_TEMPLATE % video_id
config = json.loads(self._download_webpage(config_url, video_id, config = self._download_json(config_url, video_id)
u'Downloading video info'))
media = config['playlist']['media'] media = config['playlist']['media']
video_url = media['url']
return {'id': media['metadata']['videoId'], return {
'url': video_url, 'id': media['metadata']['videoId'],
'ext': determine_ext(video_url), 'url': media['url'],
'title': media['metadata']['title'], 'title': media['metadata']['title'],
'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
} }
class OneUPIE(IGNIE): class OneUPIE(IGNIE):
"""Extractor for 1up.com, it uses the ign videos system."""
_VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)' _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com' IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
_TEST = { _TEST = {
u'url': u'http://gamevideos.1up.com/video/id/34976', 'url': 'http://gamevideos.1up.com/video/id/34976',
u'file': u'34976.mp4', 'md5': '68a54ce4ebc772e4b71e3123d413163d',
u'md5': u'68a54ce4ebc772e4b71e3123d413163d', 'info_dict': {
u'info_dict': { 'id': '34976',
u'title': u'Sniper Elite V2 - Trailer', 'ext': 'mp4',
u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf', 'title': 'Sniper Elite V2 - Trailer',
'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf',
} }
} }
@ -123,7 +122,6 @@ class OneUPIE(IGNIE):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
id = mobj.group('name_or_id')
result = super(OneUPIE, self)._real_extract(url) result = super(OneUPIE, self)._real_extract(url)
result['id'] = id result['id'] = mobj.group('name_or_id')
return result return result

View File

@ -1,37 +1,39 @@
# encoding: utf-8
from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
class KickStarterIE(InfoExtractor): class KickStarterIE(InfoExtractor):
_VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*' _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*'
_TEST = { _TEST = {
u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location", 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location',
u"file": u"1404461844.mp4", 'md5': 'c81addca81327ffa66c642b5d8b08cab',
u"md5": u"c81addca81327ffa66c642b5d8b08cab", 'info_dict': {
u"info_dict": { 'id': '1404461844',
u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling", 'ext': 'mp4',
'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
'description': 'A unique motocross documentary that examines the '
'life and mind of one of sports most elite athletes: Josh Grant.',
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
m = re.match(self._VALID_URL, url) m = re.match(self._VALID_URL, url)
video_id = m.group('id') video_id = m.group('id')
webpage_src = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'data-video="(.*?)">', video_url = self._search_regex(r'data-video-url="(.*?)"',
webpage_src, u'video URL') webpage, 'video URL')
if 'mp4' in video_url: video_title = self._html_search_regex(r'<title>(.*?)</title>',
ext = 'mp4' webpage, 'title').rpartition('— Kickstarter')[0].strip()
else:
ext = 'flv'
video_title = self._html_search_regex(r"<title>(.*?)</title>",
webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip()
results = [{ return {
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'title': video_title, 'title': video_title,
'ext': ext, 'description': self._og_search_description(webpage),
}] 'thumbnail': self._og_search_thumbnail(webpage),
return results }

View File

@ -13,8 +13,9 @@ class MetacriticIE(InfoExtractor):
_TEST = { _TEST = {
'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
'file': '3698222.mp4',
'info_dict': { 'info_dict': {
'id': '3698222',
'ext': 'mp4',
'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', 'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
'duration': 221, 'duration': 221,

View File

@ -0,0 +1,75 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import int_or_none
class MusicPlayOnIE(InfoExtractor):
    """Information extractor for musicplayon.com video pages.

    Metadata is scraped from the video page.  The format list consists of
    one fixed direct mp4 stream URL plus one entry for every
    ``EXT-X-STREAM-INF`` variant listed in the site's m3u8 manifest.
    """

    _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'

    _TEST = {
        'url': 'http://en.musicplayon.com/play?v=433377',
        'info_dict': {
            'id': '433377',
            'ext': 'mp4',
            'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
            'description': 'Rick Ross Interview On Chelsea Lately',
            'duration': 342,
            'uploader': 'ultrafish',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Extract the info dict for the video page at ``url``.

        Returns a dict with ``id``, ``title``, ``description``,
        ``thumbnail``, ``uploader``, ``duration``, ``view_count`` and
        ``formats``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id)

        title = self._og_search_title(page)
        description = self._og_search_description(page)
        thumbnail = self._og_search_thumbnail(page)
        duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
        view_count = self._og_search_property('count', page, fatal=False)
        uploader = self._html_search_regex(
            r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)

        formats = [
            {
                'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
                'ext': 'mp4',
            }
        ]

        manifest = self._download_webpage(
            'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')

        # Every manifest tag starts with '#'; the text before the first one
        # is dropped.
        for entry in manifest.split('#')[1:]:
            if not entry.startswith('EXT-X-STREAM-INF:'):
                continue
            # First line holds the attribute list, second line the stream
            # URL.  A distinct local name keeps the ``url`` argument intact.
            meta, stream_url = entry.split('\n')[:2]
            # The first comma-separated item (tag name plus its first
            # attribute) is skipped.  Split on the first '=' only so that
            # values containing '=' do not break the dict construction.
            params = dict(param.split('=', 1) for param in meta.split(',')[1:])
            # RESOLUTION is '<width>x<height>'.
            width, height = params['RESOLUTION'].split('x')
            formats.append({
                'url': stream_url,
                'ext': 'mp4',
                'tbr': int(params['BANDWIDTH']),
                'width': int(width),
                'height': int(height),
                'format_note': params['NAME'].replace('"', '').strip(),
            })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'duration': int_or_none(duration),
            'view_count': int_or_none(view_count),
            'formats': formats,
        }

View File

@ -6,12 +6,13 @@ from .common import InfoExtractor
class NBAIE(InfoExtractor): class NBAIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
_TEST = { _TEST = {
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'file': u'0021200253-okc-bkn-recap.nba.mp4',
'md5': u'c0edcfc37607344e2ff8f13c378c88a4', 'md5': u'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': { 'info_dict': {
'id': '0021200253-okc-bkn-recap.nba',
'ext': 'mp4',
'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'title': 'Thunder vs. Nets', 'title': 'Thunder vs. Nets',
}, },
@ -19,7 +20,7 @@ class NBAIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1) video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
@ -33,7 +34,6 @@ class NBAIE(InfoExtractor):
return { return {
'id': shortened_video_id, 'id': shortened_video_id,
'url': video_url, 'url': video_url,
'ext': 'mp4',
'title': title, 'title': title,
'description': description, 'description': description,
} }

View File

@ -5,7 +5,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
RegexNotFoundError, ExtractorError,
unescapeHTML unescapeHTML
) )
@ -98,16 +98,15 @@ class NTVIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page') page = self._download_webpage(url, video_id, 'Downloading page')
def extract(patterns, name, page, fatal=False): for pattern in self._VIDEO_ID_REGEXES:
for pattern in patterns: mobj = re.search(pattern, page)
mobj = re.search(pattern, page) if mobj:
if mobj: break
return mobj.group(1)
if fatal:
raise RegexNotFoundError(u'Unable to extract %s' % name)
return None
video_id = extract(self._VIDEO_ID_REGEXES, 'video id', page, fatal=True) if not mobj:
raise ExtractorError('No media links available for %s' % video_id)
video_id = mobj.group(1)
player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML') player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
title = unescapeHTML(player.find('./data/title').text) title = unescapeHTML(player.find('./data/title').text)

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re import re
import os import os
@ -5,45 +7,50 @@ from .common import InfoExtractor
class PyvideoIE(InfoExtractor): class PyvideoIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
_TESTS = [{
u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', _TESTS = [
u'file': u'24_4WWkSmNo.mp4', {
u'md5': u'de317418c8bc76b1fd8633e4f32acbc6', 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
u'info_dict': { 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
u"title": u"Become a logging expert in 30 minutes", 'info_dict': {
u"description": u"md5:9665350d466c67fb5b1598de379021f7", 'id': '24_4WWkSmNo',
u"upload_date": u"20130320", 'ext': 'mp4',
u"uploader": u"NextDayVideo", 'title': 'Become a logging expert in 30 minutes',
u"uploader_id": u"NextDayVideo", 'description': 'md5:9665350d466c67fb5b1598de379021f7',
'upload_date': '20130320',
'uploader': 'NextDayVideo',
'uploader_id': 'NextDayVideo',
},
'add_ie': ['Youtube'],
}, },
u'add_ie': ['Youtube'], {
}, 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
{ 'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', 'info_dict': {
u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12', 'id': '2542',
u'info_dict': { 'ext': 'm4v',
u'id': u'2542', 'title': 'Gloriajw-SpotifyWithErikBernhardsson182',
u'ext': u'm4v', },
u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
}, },
},
] ]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
webpage = self._download_webpage(url, video_id)
m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
if m_youtube is not None: if m_youtube is not None:
return self.url_result(m_youtube.group(1), 'Youtube') return self.url_result(m_youtube.group(1), 'Youtube')
title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>', title = self._html_search_regex(
webpage, u'title', flags=re.DOTALL) r'<div class="section">.*?<h3>([^>]+?)</h3>', webpage, 'title', flags=re.DOTALL)
video_url = self._search_regex([r'<source src="(.*?)"', video_url = self._search_regex(
r'<dt>Download</dt>.*?<a href="(.+?)"'], [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'],
webpage, u'video url', flags=re.DOTALL) webpage, 'video url', flags=re.DOTALL)
return { return {
'id': video_id, 'id': video_id,
'title': os.path.splitext(title)[0], 'title': os.path.splitext(title)[0],

View File

@ -1,5 +1,6 @@
from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import unified_strdate, determine_ext from ..utils import unified_strdate, determine_ext
@ -9,41 +10,44 @@ class RoxwelIE(InfoExtractor):
_VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
_TEST = { _TEST = {
u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html',
u'file': u'passionpittakeawalklive.flv', 'info_dict': {
u'md5': u'd9dea8360a1e7d485d2206db7fe13035', 'id': 'passionpittakeawalklive',
u'info_dict': { 'ext': 'flv',
u'title': u'Take A Walk (live)', 'title': 'Take A Walk (live)',
u'uploader': u'Passion Pit', 'uploader': 'Passion Pit',
u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', 'uploader_id': 'passionpit',
'upload_date': '20120928',
'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
}, },
u'skip': u'Requires rtmpdump', 'params': {
# rtmp download
'skip_download': True,
}
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
filename = mobj.group('filename') filename = mobj.group('filename')
info_url = 'http://www.roxwel.com/api/videos/%s' % filename info_url = 'http://www.roxwel.com/api/videos/%s' % filename
info_page = self._download_webpage(info_url, filename, info = self._download_json(info_url, filename)
u'Downloading video info')
self.report_extraction(filename)
info = json.loads(info_page)
rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
best_rate = rtmp_rates[-1] best_rate = rtmp_rates[-1]
url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url')
ext = determine_ext(rtmp_url) ext = determine_ext(rtmp_url)
if ext == 'f4v': if ext == 'f4v':
rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
return {'id': filename, return {
'title': info['title'], 'id': filename,
'url': rtmp_url, 'title': info['title'],
'ext': 'flv', 'url': rtmp_url,
'description': info['description'], 'ext': 'flv',
'thumbnail': info.get('player_image_url') or info.get('image_url_large'), 'description': info['description'],
'uploader': info['artist'], 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
'uploader_id': info['artistname'], 'uploader': info['artist'],
'upload_date': unified_strdate(info['dbdate']), 'uploader_id': info['artistname'],
} 'upload_date': unified_strdate(info['dbdate']),
}

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
@ -20,8 +19,9 @@ class RutubeIE(InfoExtractor):
_TEST = { _TEST = {
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4',
'info_dict': { 'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
'ext': 'mp4',
'title': 'Раненный кенгуру забежал в аптеку', 'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ', 'description': 'http://www.ntdtv.ru ',
'duration': 80, 'duration': 80,
@ -38,15 +38,15 @@ class RutubeIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id, video = self._download_json(
video_id, 'Downloading video JSON') 'http://rutube.ru/api/video/%s/?format=json' % video_id,
video = json.loads(api_response) video_id, 'Downloading video JSON')
api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, trackinfo = self._download_json(
video_id, 'Downloading trackinfo JSON') 'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
trackinfo = json.loads(api_response) video_id, 'Downloading trackinfo JSON')
# Some videos don't have the author field # Some videos don't have the author field
author = trackinfo.get('author') or {} author = trackinfo.get('author') or {}
m3u8_url = trackinfo['video_balancer'].get('m3u8') m3u8_url = trackinfo['video_balancer'].get('m3u8')
@ -79,10 +79,9 @@ class RutubeChannelIE(InfoExtractor):
def _extract_videos(self, channel_id, channel_title=None): def _extract_videos(self, channel_id, channel_title=None):
entries = [] entries = []
for pagenum in itertools.count(1): for pagenum in itertools.count(1):
api_response = self._download_webpage( page = self._download_json(
self._PAGE_TEMPLATE % (channel_id, pagenum), self._PAGE_TEMPLATE % (channel_id, pagenum),
channel_id, 'Downloading page %s' % pagenum) channel_id, 'Downloading page %s' % pagenum)
page = json.loads(api_response)
results = page['results'] results = page['results']
if not results: if not results:
break break
@ -108,10 +107,9 @@ class RutubeMovieIE(RutubeChannelIE):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
movie_id = mobj.group('id') movie_id = mobj.group('id')
api_response = self._download_webpage( movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id, self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON') 'Downloading movie JSON')
movie = json.loads(api_response)
movie_name = movie['name'] movie_name = movie['name']
return self._extract_videos(movie_id, movie_name) return self._extract_videos(movie_id, movie_name)

View File

@ -1,33 +1,37 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
class TF1IE(InfoExtractor): class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player.""" """TF1 uses the wat.tv player."""
_VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html' _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
_TEST = { _TEST = {
u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
u'file': u'10635995.mp4', 'info_dict': {
u'md5': u'2e378cc28b9957607d5e88f274e637d8', 'id': '10635995',
u'info_dict': { 'ext': 'mp4',
u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle', 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
},
'params': {
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
}, },
u'skip': u'Sometimes wat serves the whole file with the --test option',
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
id = mobj.group(1) video_id = mobj.group('id')
webpage = self._download_webpage(url, id) webpage = self._download_webpage(url, video_id)
embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"', embed_url = self._html_search_regex(
webpage, 'embed url') r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page') embed_page = self._download_webpage(embed_url, video_id,
'Downloading embed player page')
wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info') wat_info = self._download_json(
wat_info = json.loads(wat_info)['media'] 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
wat_url = wat_info['url'] return self.url_result(wat_info['media']['url'], 'Wat')
return self.url_result(wat_url, 'Wat')

View File

@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor): class VKIE(InfoExtractor):
IE_NAME = 'vk.com' IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))' _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk' _NETRC_MACHINE = 'vk'
_TESTS = [ _TESTS = [

View File

@ -1,37 +1,37 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unified_strdate, unified_strdate,
) )
class WatIE(InfoExtractor): class WatIE(InfoExtractor):
_VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html' _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
IE_NAME = 'wat.tv' IE_NAME = 'wat.tv'
_TEST = { _TEST = {
u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', 'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
u'file': u'10631273.mp4', 'info_dict': {
u'md5': u'd8b2231e1e333acd12aad94b80937e19', 'id': '10631273',
u'info_dict': { 'ext': 'mp4',
u'title': u'World War Z - Philadelphia VOST', 'title': 'World War Z - Philadelphia VOST',
u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', 'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
},
'params': {
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
}, },
u'skip': u'Sometimes wat serves the whole file with the --test option',
} }
def download_video_info(self, real_id): def download_video_info(self, real_id):
# 'contentv4' is used in the website, but it also returns the related # 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them # videos, we don't need them
info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info') info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
info = json.loads(info)
return info['media'] return info['media']
def _real_extract(self, url): def _real_extract(self, url):
def real_id_for_chapter(chapter): def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0] return chapter['tc_start'].split('-')[0]
@ -56,17 +56,17 @@ class WatIE(InfoExtractor):
entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
return self.playlist_result(entries, real_id, video_info['title']) return self.playlist_result(entries, real_id, video_info['title'])
upload_date = None
if 'date_diffusion' in first_chapter:
upload_date = unified_strdate(first_chapter['date_diffusion'])
# Otherwise we can continue and extract just one part, we have to use # Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url # the short id for getting the video url
info = {'id': real_id, return {
'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, 'id': real_id,
'ext': 'mp4', 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
'title': first_chapter['title'], 'title': first_chapter['title'],
'thumbnail': first_chapter['preview'], 'thumbnail': first_chapter['preview'],
'description': first_chapter['description'], 'description': first_chapter['description'],
'view_count': video_info['views'], 'view_count': video_info['views'],
} 'upload_date': upload_date,
if 'date_diffusion' in first_chapter: }
info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
return info

View File

@ -4,9 +4,10 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unified_strdate, compat_parse_qs,
compat_urlparse, compat_urlparse,
determine_ext, determine_ext,
unified_strdate,
) )
@ -111,4 +112,85 @@ class WDRIE(InfoExtractor):
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'upload_date': upload_date, 'upload_date': upload_date,
} }
class WDRMausIE(InfoExtractor):
    """Extractor for video pages on wdrmaus.de ("Sendung mit der Maus").

    The page embeds its player parameters as a query string inside a
    ``?startVideo=1&amp;...`` link; that query string carries the stream
    URL (``firstVideo``) and the preview image (``startPicture``).
    """

    # Raw string: the pattern contains ``\.`` which is not a valid escape
    # sequence in a normal string literal.
    _VALID_URL = r'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
    IE_DESC = 'Sendung mit der Maus'
    _TESTS = [{
        'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
        'info_dict': {
            'id': 'aktuelle-sendung',
            'ext': 'mp4',
            'thumbnail': r're:^http://.+\.jpg',
            'upload_date': 're:^[0-9]{8}$',
            'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
        }
    }, {
        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
        'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
        'info_dict': {
            'id': '40_jahre_maus',
            'ext': 'mp4',
            'thumbnail': r're:^http://.+\.jpg',
            'upload_date': '20131007',
            'title': '12.03.2011 - 40 Jahre Maus',
        }
    }]

    def _real_extract(self, url):
        """Build the info dict for a single wdrmaus.de page.

        Returns a dict with ``id``, ``title``, ``formats``, ``thumbnail``
        and ``upload_date``.  The title is the on-page air date joined with
        the ``<h1>`` heading; the upload date comes from the ``dc.date``
        meta tag.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        param_code = self._html_search_regex(
            r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')

        title_date = self._search_regex(
            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
            webpage, 'air date')
        title_str = self._html_search_regex(
            r'<h1>(.*?)</h1>', webpage, 'title')
        title = '%s - %s' % (title_date, title_str)
        upload_date = unified_strdate(
            self._html_search_meta('dc.date', webpage))

        fields = compat_parse_qs(param_code)
        video_url = fields['firstVideo'][0]
        # startPicture may be relative, so resolve it against the page URL.
        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])

        formats = [{
            'format_id': 'rtmp',
            'url': video_url,
        }]

        # Best effort (fatal=False): the site's JS holds a table that maps
        # stream URL prefixes to plain download URL prefixes.
        jscode = self._download_webpage(
            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
            video_id, fatal=False,
            note='Downloading URL translation table',
            errnote='Could not download URL translation table')
        if jscode:
            for m in re.finditer(
                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
                    jscode):
                if video_url.startswith(m.group('stream')):
                    # Swap only the matched prefix; count=1 keeps any later
                    # occurrence of the same text in the URL untouched.
                    http_url = video_url.replace(
                        m.group('stream'), m.group('dl'), 1)
                    formats.append({
                        'format_id': 'http',
                        'url': http_url,
                    })
                    break

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }
# TODO test _1

View File

@ -7,13 +7,13 @@ import itertools
import json import json
import os.path import os.path
import re import re
import string
import struct import struct
import traceback import traceback
import zlib import zlib
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..utils import ( from ..utils import (
compat_chr, compat_chr,
compat_parse_qs, compat_parse_qs,
@ -438,113 +438,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode):
funcname = self._search_regex( funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode, r'signature=([a-zA-Z]+)', jscode,
u'Initial JS player signature function name') u'Initial JS player signature function name')
functions = {} jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
def argidx(varname):
return string.lowercase.index(varname)
def interpret_statement(stmt, local_vars, allow_recursion=20):
if allow_recursion < 0:
raise ExtractorError(u'Recursion limit reached')
if stmt.startswith(u'var '):
stmt = stmt[len(u'var '):]
ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
r'=(?P<expr>.*)$', stmt)
if ass_m:
if ass_m.groupdict().get('index'):
def assign(val):
lvar = local_vars[ass_m.group('out')]
idx = interpret_expression(ass_m.group('index'),
local_vars, allow_recursion)
assert isinstance(idx, int)
lvar[idx] = val
return val
expr = ass_m.group('expr')
else:
def assign(val):
local_vars[ass_m.group('out')] = val
return val
expr = ass_m.group('expr')
elif stmt.startswith(u'return '):
assign = lambda v: v
expr = stmt[len(u'return '):]
else:
raise ExtractorError(
u'Cannot determine left side of statement in %r' % stmt)
v = interpret_expression(expr, local_vars, allow_recursion)
return assign(v)
def interpret_expression(expr, local_vars, allow_recursion):
if expr.isdigit():
return int(expr)
if expr.isalpha():
return local_vars[expr]
m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
if m:
member = m.group('member')
val = local_vars[m.group('in')]
if member == 'split("")':
return list(val)
if member == 'join("")':
return u''.join(val)
if member == 'length':
return len(val)
if member == 'reverse()':
return val[::-1]
slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
if slice_m:
idx = interpret_expression(
slice_m.group('idx'), local_vars, allow_recursion-1)
return val[idx:]
m = re.match(
r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
if m:
val = local_vars[m.group('in')]
idx = interpret_expression(m.group('idx'), local_vars,
allow_recursion-1)
return val[idx]
m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
if m:
a = interpret_expression(m.group('a'),
local_vars, allow_recursion)
b = interpret_expression(m.group('b'),
local_vars, allow_recursion)
return a % b
m = re.match(
r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
if m:
fname = m.group('func')
if fname not in functions:
functions[fname] = extract_function(fname)
argvals = [int(v) if v.isdigit() else local_vars[v]
for v in m.group('args').split(',')]
return functions[fname](argvals)
raise ExtractorError(u'Unsupported JS expression %r' % expr)
def extract_function(funcname):
func_m = re.search(
r'function ' + re.escape(funcname) +
r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
jscode)
argnames = func_m.group('args').split(',')
def resf(args):
local_vars = dict(zip(argnames, args))
for stmt in func_m.group('code').split(';'):
res = interpret_statement(stmt, local_vars)
return res
return resf
initial_function = extract_function(funcname)
return lambda s: initial_function([s]) return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents): def _parse_sig_swf(self, file_contents):
@ -1549,7 +1446,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
break break
more = self._download_json( more = self._download_json(
'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num) 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
'Downloading page #%s' % page_num,
transform_source=uppercase_escape)
content_html = more['content_html'] content_html = more['content_html']
more_widget_html = more['load_more_widget_html'] more_widget_html = more['load_more_widget_html']
@ -1712,7 +1611,7 @@ class YoutubeUserIE(InfoExtractor):
class YoutubeSearchIE(SearchInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches' IE_DESC = u'YouTube.com searches'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_MAX_RESULTS = 1000 _MAX_RESULTS = 1000
IE_NAME = u'youtube:search' IE_NAME = u'youtube:search'
_SEARCH_KEY = 'ytsearch' _SEARCH_KEY = 'ytsearch'
@ -1723,9 +1622,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
video_ids = [] video_ids = []
pagenum = 0 pagenum = 0
limit = n limit = n
PAGE_SIZE = 50
while (50 * pagenum) < limit: while (PAGE_SIZE * pagenum) < limit:
result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) result_url = self._API_URL % (
compat_urllib_parse.quote_plus(query.encode('utf-8')),
(PAGE_SIZE * pagenum) + 1)
data_json = self._download_webpage( data_json = self._download_webpage(
result_url, video_id=u'query "%s"' % query, result_url, video_id=u'query "%s"' % query,
note=u'Downloading page %s' % (pagenum + 1), note=u'Downloading page %s' % (pagenum + 1),
@ -1836,11 +1738,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = [] feed_entries = []
paging = 0 paging = 0
for i in itertools.count(1): for i in itertools.count(1):
info = self._download_webpage(self._FEED_TEMPLATE % paging, info = self._download_json(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME, u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i) u'Downloading page %s' % i)
info = json.loads(info) feed_html = info.get('feed_html') or info.get('content_html')
feed_html = info['feed_html']
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids) ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend( feed_entries.extend(

116
youtube_dl/jsinterp.py Normal file
View File

@ -0,0 +1,116 @@
from __future__ import unicode_literals
import re
from .utils import (
ExtractorError,
)
class JSInterpreter(object):
    """Interpreter for a very small subset of JavaScript.

    It locates functions in ``code`` by regular expression and evaluates
    their bodies statement by statement.  Supported statements are
    (optionally ``var``-prefixed) assignments to a variable or to an
    indexed element, and ``return``.  Supported expressions are integer
    literals, variable names, ``x.split("")``, ``x.join("")``,
    ``x.length``, ``x.reverse()``, ``x.slice(n)``, ``x[i]``, ``a%b`` and
    calls to other functions found in the same code.
    """

    def __init__(self, code):
        # Full JavaScript source that functions are looked up in.
        self.code = code
        # Cache of already extracted functions, keyed by name.
        self._functions = {}

    def interpret_statement(self, stmt, local_vars, allow_recursion=20):
        """Execute one statement and return the value it produced.

        ``local_vars`` is the mutable variable scope; assignments update
        it in place.  Raises ExtractorError if the recursion budget is
        exhausted or the statement is neither an assignment nor a
        ``return``.
        """
        if allow_recursion < 0:
            raise ExtractorError('Recursion limit reached')

        if stmt.startswith('var '):
            stmt = stmt[len('var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m:
            expr = ass_m.group('expr')
            if ass_m.groupdict().get('index'):
                # Element assignment: out[index] = expr
                def assign(val):
                    lvar = local_vars[ass_m.group('out')]
                    idx = self.interpret_expression(
                        ass_m.group('index'), local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    lvar[idx] = val
                    return val
            else:
                # Plain variable assignment: out = expr
                def assign(val):
                    local_vars[ass_m.group('out')] = val
                    return val
        elif stmt.startswith('return '):
            assign = lambda v: v
            expr = stmt[len('return '):]
        else:
            raise ExtractorError(
                'Cannot determine left side of statement in %r' % stmt)

        v = self.interpret_expression(expr, local_vars, allow_recursion)
        return assign(v)

    def interpret_expression(self, expr, local_vars, allow_recursion):
        """Evaluate ``expr`` in the scope ``local_vars`` and return its value.

        Raises ExtractorError for any expression outside the supported
        subset.
        """
        if expr.isdigit():
            return int(expr)

        if expr.isalpha():
            return local_vars[expr]

        # Member access / method call on a variable: x.member
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        if m:
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
                return list(val)
            if member == 'join("")':
                return ''.join(val)
            if member == 'length':
                return len(val)
            if member == 'reverse()':
                return val[::-1]
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
            if slice_m:
                idx = self.interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion - 1)
                return val[idx:]

        # Element access: x[idx]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(
                m.group('idx'), local_vars, allow_recursion - 1)
            return val[idx]

        # Modulo, the only binary operator supported.
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        if m:
            a = self.interpret_expression(
                m.group('a'), local_vars, allow_recursion)
            b = self.interpret_expression(
                m.group('b'), local_vars, allow_recursion)
            return a % b

        # Call of another function defined in self.code.
        m = re.match(
            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        if m:
            fname = m.group('func')
            if fname not in self._functions:
                self._functions[fname] = self.extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return self._functions[fname](argvals)
        raise ExtractorError('Unsupported JS expression %r' % expr)

    def extract_function(self, funcname):
        """Return a Python callable implementing the JS function ``funcname``.

        Both ``function name(...){...}`` and ``name = function(...){...}``
        definitions are recognised.  The returned callable takes a single
        list of positional argument values and returns the function's
        result.  Raises ExtractorError if no definition is found.
        """
        func_m = re.search(
            (r'(?:function %s|%s\s*=\s*function)' % (
                re.escape(funcname), re.escape(funcname))) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            self.code)
        if func_m is None:
            raise ExtractorError('Could not find JS function %r' % funcname)
        argnames = func_m.group('args').split(',')

        def resf(args):
            local_vars = dict(zip(argnames, args))
            res = None
            for stmt in func_m.group('code').split(';'):
                stmt = stmt.strip()
                if not stmt:
                    # A trailing or doubled ';' yields an empty statement.
                    continue
                res = self.interpret_statement(stmt, local_vars)
                if stmt.startswith('return '):
                    # 'return' ends the function; later statements must
                    # not run or overwrite the result.
                    break
            return res
        return resf

View File

@ -55,8 +55,9 @@ class FFmpegPostProcessor(PostProcessor):
if self._downloader.params.get('verbose', False): if self._downloader.params.get('verbose', False):
self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) bcmd = [self._downloader.encode(c) for c in cmd]
stdout,stderr = p.communicate() p = subprocess.Popen(bcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if p.returncode != 0: if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace') stderr = stderr.decode('utf-8', 'replace')
msg = stderr.strip().split('\n')[-1] msg = stderr.strip().split('\n')[-1]

View File

@ -539,7 +539,6 @@ def encodeFilename(s, for_subprocess=False):
encoding = 'utf-8' encoding = 'utf-8'
return s.encode(encoding, 'ignore') return s.encode(encoding, 'ignore')
def decodeOption(optval): def decodeOption(optval):
if optval is None: if optval is None:
return optval return optval
@ -1269,8 +1268,8 @@ class PagedList(object):
def uppercase_escape(s):
    """Replace every ``\\UXXXXXXXX`` escape in *s* with its character.

    Only the eight-hex-digit uppercase form is expanded; all other text
    is returned unchanged.
    """
    # Local import keeps this function self-contained.
    import codecs

    # str has no .decode() on Python 3, so go through the codec's decoder
    # function, which accepts text on both major Python versions.
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
try: try:
struct.pack(u'!I', 0) struct.pack(u'!I', 0)

View File

@ -1,2 +1,2 @@
__version__ = '2014.03.28' __version__ = '2014.04.02'