Merge remote-tracking branch 'origin/master' into metadata

Conflicts:
	youtube_dl/utils.py
Author: epitron
Date:   2013-12-07 04:15:44 -05:00
Commit: 3fdf6b5b15
107 changed files with 1878 additions and 661 deletions

File: README.md

@ -30,7 +30,8 @@ which means you can modify it, redistribute it or use it however you like.
--list-extractors List all supported extractors and the URLs they
would handle
--extractor-descriptions Output descriptions of all supported extractors
--proxy URL Use the specified HTTP/HTTPS proxy
--proxy URL Use the specified HTTP/HTTPS proxy. Pass in an
empty string (--proxy "") for direct connection
--no-check-certificate Suppress HTTPS certificate validation.
--cache-dir DIR Location in the filesystem where youtube-dl can
store downloaded information permanently. By
@ -55,8 +56,9 @@ which means you can modify it, redistribute it or use it however you like.
--dateafter DATE download only videos uploaded after this date
--no-playlist download only the currently playing video
--age-limit YEARS download only videos suitable for the given age
--download-archive FILE Download only videos not present in the archive
file. Record all downloaded videos in it.
--download-archive FILE Download only videos not listed in the archive
file. Record the IDs of all downloaded videos in
it.
## Download Options:
-r, --rate-limit LIMIT maximum download rate in bytes per second (e.g.
@ -130,11 +132,11 @@ which means you can modify it, redistribute it or use it however you like.
-v, --verbose print various debugging information
--dump-intermediate-pages print downloaded pages to debug problems (very
verbose)
--write-pages Write downloaded pages to files in the current
directory
--write-pages Write downloaded intermediary pages to files in
the current directory to debug problems
## Video Format Options:
-f, --format FORMAT video format code, specifiy the order of
-f, --format FORMAT video format code, specify the order of
preference using slashes: "-f 22/17/18". "-f mp4"
and "-f flv" are also supported
--all-formats download all available video formats
@ -182,7 +184,7 @@ which means you can modify it, redistribute it or use it however you like.
# CONFIGURATION
You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`.
You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
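For example, an illustrative `~/.config/youtube-dl.conf` holding the defaults mentioned above would contain:

    --extract-audio
    --no-mtime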
# OUTPUT TEMPLATE

File: devscripts/bash-completion.in

@ -1,10 +1,21 @@
__youtube_dl()
{
local cur prev opts
local cur prev opts fileopts diropts keywords
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
prev="${COMP_WORDS[COMP_CWORD-1]}"
opts="{{flags}}"
keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater"
keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
fileopts="-a|--batch-file|--download-archive|--cookies"
diropts="--cache-dir"
if [[ ${prev} =~ ${fileopts} ]]; then
COMPREPLY=( $(compgen -f -- ${cur}) )
return 0
elif [[ ${prev} =~ ${diropts} ]]; then
COMPREPLY=( $(compgen -d -- ${cur}) )
return 0
fi
if [[ ${cur} =~ : ]]; then
COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) )

File: test/helper.py

@ -12,10 +12,6 @@ from youtube_dl import YoutubeDL
from youtube_dl.utils import preferredencoding
def global_setup():
youtube_dl._setup_opener(timeout=10)
def get_params(override=None):
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"parameters.json")

File: test/parameters.json

@ -39,5 +39,6 @@
"writeinfojson": true,
"writesubtitles": false,
"allsubtitles": false,
"listssubtitles": false
"listssubtitles": false,
"socket_timeout": 20
}

File: test/test_age_restriction.py

@ -6,8 +6,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import global_setup, try_rm
global_setup()
from test.helper import try_rm
from youtube_dl import YoutubeDL

File: test/test_all_urls.py

@ -100,10 +100,15 @@ class TestAllURLsMatching(unittest.TestCase):
def test_keywords(self):
self.assertMatch(':ytsubs', ['youtube:subscriptions'])
self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
self.assertMatch(':thedailyshow', ['ComedyCentral'])
self.assertMatch(':tds', ['ComedyCentral'])
self.assertMatch(':colbertreport', ['ComedyCentral'])
self.assertMatch(':cr', ['ComedyCentral'])
self.assertMatch(':ythistory', ['youtube:history'])
self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
self.assertMatch(':tds', ['ComedyCentralShows'])
self.assertMatch(':colbertreport', ['ComedyCentralShows'])
self.assertMatch(':cr', ['ComedyCentralShows'])
def test_vimeo_matching(self):
self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
if __name__ == '__main__':

File: test/test_download.py

@ -9,12 +9,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
get_params,
get_testcases,
global_setup,
try_rm,
md5,
report_warning
)
global_setup()
import hashlib

File: test/test_playlists.py

@ -8,21 +8,25 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup
global_setup()
from test.helper import FakeYDL
from youtube_dl.extractor import (
DailymotionPlaylistIE,
DailymotionUserIE,
VimeoChannelIE,
VimeoUserIE,
VimeoAlbumIE,
VimeoGroupsIE,
UstreamChannelIE,
SoundcloudSetIE,
SoundcloudUserIE,
LivestreamIE,
NHLVideocenterIE,
BambuserChannelIE,
BandcampAlbumIE
BandcampAlbumIE,
SmotriCommunityIE,
SmotriUserIE
)
@ -55,6 +59,30 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], u'Vimeo Tributes')
self.assertTrue(len(result['entries']) > 24)
def test_vimeo_user(self):
dl = FakeYDL()
ie = VimeoUserIE(dl)
result = ie.extract('http://vimeo.com/nkistudio/videos')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], u'Nki')
self.assertTrue(len(result['entries']) > 65)
def test_vimeo_album(self):
dl = FakeYDL()
ie = VimeoAlbumIE(dl)
result = ie.extract('http://vimeo.com/album/2632481')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], u'Staff Favorites: November 2013')
self.assertTrue(len(result['entries']) > 12)
def test_vimeo_groups(self):
dl = FakeYDL()
ie = VimeoGroupsIE(dl)
result = ie.extract('http://vimeo.com/groups/rolexawards')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], u'Rolex Awards for Enterprise')
self.assertTrue(len(result['entries']) > 72)
def test_ustream_channel(self):
dl = FakeYDL()
ie = UstreamChannelIE(dl)
@ -111,6 +139,24 @@ class TestPlaylists(unittest.TestCase):
self.assertIsPlaylist(result)
self.assertEqual(result['title'], u'Nightmare Night EP')
self.assertTrue(len(result['entries']) >= 4)
def test_smotri_community(self):
dl = FakeYDL()
ie = SmotriCommunityIE(dl)
result = ie.extract('http://smotri.com/community/video/kommuna')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], u'kommuna')
self.assertEqual(result['title'], u'КПРФ')
self.assertTrue(len(result['entries']) >= 4)
def test_smotri_user(self):
dl = FakeYDL()
ie = SmotriUserIE(dl)
result = ie.extract('http://smotri.com/user/inspector')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], u'inspector')
self.assertEqual(result['title'], u'Inspector')
self.assertTrue(len(result['entries']) >= 9)
if __name__ == '__main__':
unittest.main()

File: test/test_subtitles.py

@ -6,8 +6,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup, md5
global_setup()
from test.helper import FakeYDL, md5
from youtube_dl.extractor import (
@ -73,7 +72,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitlesformat'] = 'vtt'
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
def test_youtube_list_subtitles(self):
self.DL.expect_warning(u'Video doesn\'t have automatic captions')

File: test/test_utils.py

@ -26,6 +26,7 @@ from youtube_dl.utils import (
unsmuggle_url,
shell_quote,
encodeFilename,
str_to_int,
)
if sys.version_info < (3, 0):
@ -176,6 +177,10 @@ class TestUtil(unittest.TestCase):
args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')]
self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""")
def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
self.assertEqual(str_to_int('123.456'), 123456)
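A minimal sketch of a `str_to_int` consistent with these tests (the actual implementation lives in youtube_dl/utils.py and may differ in detail):

    import re

    def str_to_int(int_str):
        # drop thousands separators, both ',' and '.', then parse
        int_str = re.sub(r'[,\.]', '', int_str)
        return int(int_str)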
if __name__ == '__main__':
unittest.main()

File: test/test_write_annotations.py

@ -7,8 +7,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, global_setup, try_rm
global_setup()
from test.helper import get_params, try_rm
import io

File: test/test_write_info_json.py

@ -7,8 +7,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, global_setup
global_setup()
from test.helper import get_params
import io
@ -34,6 +33,7 @@ TEST_ID = 'BaW_jenozKc'
INFO_JSON_FILE = TEST_ID + '.info.json'
DESCRIPTION_FILE = TEST_ID + '.mp4.description'
EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐
test URL: https://github.com/rg3/youtube-dl/issues/1892
This is a test video for youtube-dl.

File: test/test_youtube_lists.py

@ -6,8 +6,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup
global_setup()
from test.helper import FakeYDL
from youtube_dl.extractor import (
@ -108,5 +107,14 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('http://www.youtube.com/show/airdisasters')
self.assertTrue(len(result) >= 3)
def test_youtube_mix(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y')
entries = result['entries']
self.assertTrue(len(entries) >= 20)
original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
if __name__ == '__main__':
unittest.main()

File: test/test_youtube_signature.py

@ -6,9 +6,6 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import global_setup
global_setup()
import io
import re

File: youtube_dl/FileDownloader.py

@ -1,4 +1,3 @@
import math
import os
import re
import subprocess
@ -11,6 +10,7 @@ from .utils import (
ContentTooShortError,
determine_ext,
encodeFilename,
format_bytes,
sanitize_open,
timeconvert,
)
@ -53,20 +53,6 @@ class FileDownloader(object):
self._progress_hooks = []
self.params = params
@staticmethod
def format_bytes(bytes):
if bytes is None:
return 'N/A'
if type(bytes) is str:
bytes = float(bytes)
if bytes == 0.0:
exponent = 0
else:
exponent = int(math.log(bytes, 1024.0))
suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
converted = float(bytes) / float(1024 ** exponent)
return '%.2f%s' % (converted, suffix)
@staticmethod
def format_seconds(seconds):
(mins, secs) = divmod(seconds, 60)
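The helper removed above was not dropped but moved to youtube_dl/utils.py (note the new `format_bytes` import at the top of this file). A sketch of the relocated function, assuming it kept the deleted logic:

    import math

    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = int(math.log(bytes, 1024.0))
        suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)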
@ -117,7 +103,7 @@ class FileDownloader(object):
def format_speed(speed):
if speed is None:
return '%10s' % '---b/s'
return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
return '%10s' % ('%s/s' % format_bytes(speed))
@staticmethod
def best_block_size(elapsed_time, bytes):
@ -270,6 +256,61 @@ class FileDownloader(object):
(clear_line, data_len_str, self.format_seconds(tot_time)))
def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
def run_rtmpdump(args):
start = time.time()
resume_percent = None
resume_downloaded_data_len = None
proc = subprocess.Popen(args, stderr=subprocess.PIPE)
cursor_in_new_line = True
proc_stderr_closed = False
while not proc_stderr_closed:
# read line from stderr
line = u''
while True:
char = proc.stderr.read(1)
if not char:
proc_stderr_closed = True
break
if char in [b'\r', b'\n']:
break
line += char.decode('ascii', 'replace')
if not line:
# proc_stderr_closed is True
continue
mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
if mobj:
downloaded_data_len = int(float(mobj.group(1))*1024)
percent = float(mobj.group(2))
if not resume_percent:
resume_percent = percent
resume_downloaded_data_len = downloaded_data_len
eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
data_len_str = u'~' + format_bytes(data_len)
self.report_progress(percent, data_len_str, speed, eta)
cursor_in_new_line = False
self._hook_progress({
'downloaded_bytes': downloaded_data_len,
'total_bytes': data_len,
'tmpfilename': tmpfilename,
'filename': filename,
'status': 'downloading',
'eta': eta,
'speed': speed,
})
elif self.params.get('verbose', False):
if not cursor_in_new_line:
self.to_screen(u'')
cursor_in_new_line = True
self.to_screen(u'[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
self.to_screen(u'')
return proc.returncode
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
test = self.params.get('test', False)
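For reference, a self-contained check of the progress-parsing regex in run_rtmpdump above against an illustrative stderr line (the exact rtmpdump output format is an assumption based on the pattern, not a captured run):

    import re

    line = '3212.106 kB / 42.31 sec (35.0%)'  # hypothetical rtmpdump output
    mobj = re.search(
        r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)',
        line)
    assert mobj is not None
    downloaded_data_len = int(float(mobj.group(1)) * 1024)  # ~3.1 MiB, in bytes
    percent = float(mobj.group(2))                          # 35.0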
@ -280,12 +321,11 @@ class FileDownloader(object):
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
return False
verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet'
# Download using rtmpdump. rtmpdump returns exit code 2 when
# the connection was interrupted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename]
basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
if player_url is not None:
basic_args += ['--swfVfy', player_url]
if page_url is not None:
@ -299,30 +339,48 @@ class FileDownloader(object):
if live:
basic_args += ['--live']
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
# Windows subprocess module does not actually support Unicode
# on Python 2.x
# See http://stackoverflow.com/a/9951851/35070
subprocess_encoding = sys.getfilesystemencoding()
args = [a.encode(subprocess_encoding, 'ignore') for a in args]
else:
subprocess_encoding = None
if self.params.get('verbose', False):
if subprocess_encoding:
str_args = [
a.decode(subprocess_encoding) if isinstance(a, bytes) else a
for a in args]
else:
str_args = args
try:
import pipes
shell_quote = lambda args: ' '.join(map(pipes.quote, args))
shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
except ImportError:
shell_quote = repr
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
retval = subprocess.call(args)
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
retval = run_rtmpdump(args)
while (retval == 2 or retval == 1) and not test:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
break
# Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == 2 and cursize > 1024:
self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
break
if retval == 0 or (test and retval == 2):
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
self.to_screen(u'[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@ -525,7 +583,7 @@ class FileDownloader(object):
self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False
data_len_str = self.format_bytes(data_len)
data_len_str = format_bytes(data_len)
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()

File: youtube_dl/YoutubeDL.py

@ -7,8 +7,10 @@ import errno
import io
import json
import os
import platform
import re
import shutil
import subprocess
import socket
import sys
import time
@ -18,6 +20,7 @@ if os.name == 'nt':
import ctypes
from .utils import (
compat_cookiejar,
compat_http_client,
compat_print,
compat_str,
@ -30,9 +33,12 @@ from .utils import (
DownloadError,
encodeFilename,
ExtractorError,
format_bytes,
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
PostProcessingError,
platform_name,
preferredencoding,
SameFileError,
sanitize_filename,
@ -41,9 +47,11 @@ from .utils import (
UnavailableVideoError,
write_json_file,
write_string,
YoutubeDLHandler,
)
from .extractor import get_info_extractor, gen_extractors
from .FileDownloader import FileDownloader
from .version import __version__
class YoutubeDL(object):
@ -118,9 +126,13 @@ class YoutubeDL(object):
noplaylist: Download single video instead of a playlist if in doubt.
age_limit: An integer representing the user's age in years.
Unsuitable videos for the given age are skipped.
downloadarchive: File name of a file where all downloads are recorded.
download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
proxy: URL of the proxy server to use
socket_timeout: Time to wait for unresponsive hosts, in seconds
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
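A hedged usage sketch of the new networking parameters documented above (URL, file names and output template are placeholders):

    from youtube_dl import YoutubeDL

    ydl_opts = {
        'outtmpl': u'%(title)s-%(id)s.%(ext)s',
        'proxy': '',                     # empty string = direct connection
        'nocheckcertificate': True,      # skip SSL certificate validation
        'socket_timeout': 20,            # seconds to wait on unresponsive hosts
        'cookiefile': 'cookies.txt',     # cookies are read from and dumped to this file
        'download_archive': 'archive.txt',
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.add_default_info_extractors()
        ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])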
@ -135,7 +147,7 @@ class YoutubeDL(object):
_num_downloads = None
_screen_file = None
def __init__(self, params):
def __init__(self, params=None):
"""Create a FileDownloader object with the given options."""
self._ies = []
self._ies_instances = {}
@ -144,6 +156,7 @@ class YoutubeDL(object):
self._download_retcode = 0
self._num_downloads = 0
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
self.params = {} if params is None else params
if (sys.version_info >= (3,) and sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
@ -153,14 +166,15 @@ class YoutubeDL(object):
u'Assuming --restrict-filenames since file system encoding '
u'cannot encode all characters. '
u'Set the LC_ALL environment variable to fix this.')
params['restrictfilenames'] = True
self.params['restrictfilenames'] = True
self.params = params
self.fd = FileDownloader(self, self.params)
if '%(stitle)s' in self.params['outtmpl']:
if '%(stitle)s' in self.params.get('outtmpl', ''):
self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
self._setup_opener()
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
@ -241,10 +255,9 @@ class YoutubeDL(object):
def __exit__(self, *args):
self.restore_console_title()
def fixed_template(self):
"""Checks if the output template is fixed."""
return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
if self.params.get('cookiefile') is not None:
self.cookiejar.save()
def trouble(self, message=None, tb=None):
"""Determine action to take when a download problem appears.
@ -392,7 +405,8 @@ class YoutubeDL(object):
for key, value in extra_info.items():
info_dict.setdefault(key, value)
def extract_info(self, url, download=True, ie_key=None, extra_info={}):
def extract_info(self, url, download=True, ie_key=None, extra_info={},
process=True):
'''
Returns a list with a dictionary for each video we find.
If 'download', also downloads the videos.
@ -428,7 +442,10 @@ class YoutubeDL(object):
'webpage_url': url,
'extractor_key': ie.ie_key(),
})
return self.process_ie_result(ie_result, download, extra_info)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
return ie_result
except ExtractorError as de: # An error we somewhat expected
self.report_error(compat_str(de), de.format_traceback())
break
@ -461,8 +478,33 @@ class YoutubeDL(object):
download,
ie_key=ie_result.get('ie_key'),
extra_info=extra_info)
elif result_type == 'playlist':
elif result_type == 'url_transparent':
# Use the information from the embedding page
info = self.extract_info(
ie_result['url'], ie_key=ie_result.get('ie_key'),
extra_info=extra_info, download=False, process=False)
def make_result(embedded_info):
new_result = ie_result.copy()
for f in ('_type', 'url', 'ext', 'player_url', 'formats',
'entries', 'urlhandle', 'ie_key', 'duration',
'subtitles', 'annotations', 'format',
'thumbnail', 'thumbnails'):
if f in new_result:
del new_result[f]
if f in embedded_info:
new_result[f] = embedded_info[f]
return new_result
new_result = make_result(info)
assert new_result.get('_type') != 'url_transparent'
if new_result.get('_type') == 'compat_list':
new_result['entries'] = [
make_result(e) for e in new_result['entries']]
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
elif result_type == 'playlist':
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
self.to_screen(u'[download] Downloading playlist: %s' % playlist)
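To illustrate the new 'url_transparent' branch above: a hypothetical ie_result of that type keeps metadata from the embedding page while delegating URL resolution to another extractor (extractor and URL assumed for illustration):

    ie_result = {
        '_type': 'url_transparent',
        'ie_key': 'Youtube',
        'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
        'title': 'Title taken from the embedding page',
    }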
@ -783,13 +825,15 @@ class YoutubeDL(object):
def download(self, url_list):
"""Download a given list of URLs."""
if len(url_list) > 1 and self.fixed_template():
if (len(url_list) > 1 and
'%' not in self.params['outtmpl']
and self.params.get('max_downloads') != 1):
raise SameFileError(self.params['outtmpl'])
for url in url_list:
try:
#It also downloads the videos
videos = self.extract_info(url)
self.extract_info(url)
except UnavailableVideoError:
self.report_error(u'unable to download video')
except MaxDownloadsReached:
@ -867,20 +911,26 @@ class YoutubeDL(object):
except (IOError, OSError):
self.report_warning(u'Unable to remove downloaded video file')
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
if fn is None:
return False
extractor = info_dict.get('extractor_id')
def _make_archive_id(self, info_dict):
# Future-proof against any change in case
# and backwards compatibility with prior versions
extractor = info_dict.get('extractor_key')
if extractor is None:
if 'id' in info_dict:
extractor = info_dict.get('ie_key') # key in a playlist
if extractor is None:
return None # Incomplete video information
return extractor.lower() + u' ' + info_dict['id']
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
if fn is None:
return False
vid_id = self._make_archive_id(info_dict)
if vid_id is None:
return False # Incomplete video information
# Future-proof against any change in case
# and backwards compatibility with prior versions
extractor = extractor.lower()
vid_id = extractor + u' ' + info_dict['id']
try:
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
@ -895,12 +945,15 @@ class YoutubeDL(object):
fn = self.params.get('download_archive')
if fn is None:
return
vid_id = info_dict['extractor'] + u' ' + info_dict['id']
vid_id = self._make_archive_id(info_dict)
assert vid_id
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
archive_file.write(vid_id + u'\n')
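# Illustration (not part of the diff): given _make_archive_id above, archive
# entries are one per line, in the form '<extractor key, lowercased> <video id>';
# e.g., with IDs taken from the tests in this commit:
#
#   youtube BaW_jenozKc
#   youtube rjFaenf1T-Y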
@staticmethod
def format_resolution(format, default='unknown'):
if format.get('vcodec') == 'none':
return 'audio only'
if format.get('_resolution') is not None:
return format['_resolution']
if format.get('height') is not None:
@ -914,10 +967,11 @@ class YoutubeDL(object):
def list_formats(self, info_dict):
def format_note(fdict):
if fdict.get('format_note') is not None:
return fdict['format_note']
res = u''
if fdict.get('vcodec') is not None:
if fdict.get('format_note') is not None:
res += fdict['format_note'] + u' '
if (fdict.get('vcodec') is not None and
fdict.get('vcodec') != 'none'):
res += u'%-5s' % fdict['vcodec']
elif fdict.get('vbr') is not None:
res += u'video'
@ -933,25 +987,103 @@ class YoutubeDL(object):
res += 'audio'
if fdict.get('abr') is not None:
res += u'@%3dk' % fdict['abr']
if fdict.get('filesize') is not None:
if res:
res += u', '
res += format_bytes(fdict['filesize'])
return res
def line(format):
return (u'%-20s%-10s%-12s%s' % (
def line(format, idlen=20):
return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
format['format_id'],
format['ext'],
self.format_resolution(format),
format_note(format),
)
)
))
formats = info_dict.get('formats', [info_dict])
formats_s = list(map(line, formats))
idlen = max(len(u'format code'),
max(len(f['format_id']) for f in formats))
formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1:
formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
header_line = line({
'format_id': u'format code', 'ext': u'extension',
'_resolution': u'resolution', 'format_note': u'note'})
'_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
(info_dict['id'], header_line, u"\n".join(formats_s)))
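Illustrative output of `list_formats` with the new dynamic ID column, filesize note and (worst)/(best) markers (values and column widths approximate):

    [info] Available formats for BaW_jenozKc:
    format code extension resolution  note
    140         m4a       audio only  audio@128k, 1.29MiB (worst)
    22          mp4       1280x720    (best)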
def urlopen(self, req):
""" Start an HTTP download """
return self._opener.open(req)
def print_debug_header(self):
if not self.params.get('verbose'):
return
write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=os.path.dirname(os.path.abspath(__file__)))
out, err = sp.communicate()
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
write_string(u'[debug] Git HEAD: ' + out + u'\n')
except:
try:
sys.exc_clear()
except:
pass
write_string(u'[debug] Python version %s - %s' %
(platform.python_version(), platform_name()) + u'\n')
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
timeout = 600 if timeout_val is None else float(timeout_val)
opts_cookiefile = self.params.get('cookiefile')
opts_proxy = self.params.get('proxy')
if opts_cookiefile is None:
self.cookiejar = compat_cookiejar.CookieJar()
else:
self.cookiejar = compat_cookiejar.MozillaCookieJar(
opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK):
self.cookiejar.load()
cookie_processor = compat_urllib_request.HTTPCookieProcessor(
self.cookiejar)
if opts_proxy is not None:
if opts_proxy == '':
proxies = {}
else:
proxies = {'http': opts_proxy, 'https': opts_proxy}
else:
proxies = compat_urllib_request.getproxies()
# Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
https_handler = make_HTTPS_handler(
self.params.get('nocheckcertificate', False))
opener = compat_urllib_request.build_opener(
https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
# Delete the default user-agent header, which would otherwise apply in
# cases where our custom HTTP handler doesn't come into play
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = []
self._opener = opener
# TODO remove this global modification
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(timeout)

File: youtube_dl/__init__.py

@ -36,50 +36,41 @@ __authors__ = (
'Marcin Cieślak',
'Anton Larionov',
'Takuya Tsuchida',
'Sergey M.',
)
__license__ = 'Public Domain'
import codecs
import collections
import getpass
import optparse
import os
import random
import re
import shlex
import socket
import subprocess
import sys
import traceback
import platform
from .utils import (
compat_cookiejar,
compat_print,
compat_str,
compat_urllib_request,
DateRange,
decodeOption,
determine_ext,
DownloadError,
get_cachedir,
make_HTTPS_handler,
MaxDownloadsReached,
platform_name,
preferredencoding,
SameFileError,
std_headers,
write_string,
YoutubeDLHandler,
)
from .update import update_self
from .version import __version__
from .FileDownloader import (
FileDownloader,
)
from .extractor import gen_extractors
from .version import __version__
from .YoutubeDL import YoutubeDL
from .PostProcessor import (
FFmpegMetadataPP,
@ -90,11 +81,11 @@ from .PostProcessor import (
def parseOpts(overrideArguments=None):
def _readOptions(filename_bytes):
def _readOptions(filename_bytes, default=[]):
try:
optionf = open(filename_bytes)
except IOError:
return [] # silently skip if file is not present
return default # silently skip if file is not present
try:
res = []
for l in optionf:
@ -200,7 +191,9 @@ def parseOpts(overrideArguments=None):
general.add_option('--extractor-descriptions',
action='store_true', dest='list_extractor_descriptions',
help='Output descriptions of all supported extractors', default=False)
general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
general.add_option(
'--proxy', dest='proxy', default=None, metavar='URL',
help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
@ -208,6 +201,9 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--no-cache-dir', action='store_const', const=None, dest='cachedir',
help='Disable filesystem caching')
general.add_option(
'--socket-timeout', dest='socket_timeout',
type=float, default=None, help=optparse.SUPPRESS_HELP)
selection.add_option('--playlist-start',
@ -216,7 +212,9 @@ def parseOpts(overrideArguments=None):
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
selection.add_option('--max-downloads', metavar='NUMBER',
dest='max_downloads', type=int, default=None,
help='Abort after downloading NUMBER files')
selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None)
selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 50k or 44.6m)", default=None)
selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
@ -228,7 +226,7 @@ def parseOpts(overrideArguments=None):
default=None, type=int)
selection.add_option('--download-archive', metavar='FILE',
dest='download_archive',
help='Download only videos not present in the archive file. Record all downloaded videos in it.')
help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
authentication.add_option('-u', '--username',
@ -243,7 +241,7 @@ def parseOpts(overrideArguments=None):
video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', default='best',
help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
@ -325,7 +323,7 @@ def parseOpts(overrideArguments=None):
help='print downloaded pages to debug problems (very verbose)')
verbosity.add_option('--write-pages',
action='store_true', dest='write_pages', default=False,
help='Write downloaded pages to files in the current directory')
help='Write downloaded intermediary pages to files in the current directory to debug problems')
verbosity.add_option('--youtube-print-sig-code',
action='store_true', dest='youtube_print_sig_code', default=False,
help=optparse.SUPPRESS_HELP)
@ -423,6 +421,8 @@ def parseOpts(overrideArguments=None):
if opts.verbose:
write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
else:
systemConf = _readOptions('/etc/youtube-dl.conf')
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
if xdg_config_home:
userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
@ -432,8 +432,31 @@ def parseOpts(overrideArguments=None):
userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
if not os.path.isfile(userConfFile):
userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
systemConf = _readOptions('/etc/youtube-dl.conf')
userConf = _readOptions(userConfFile)
userConf = _readOptions(userConfFile, None)
if userConf is None:
appdata_dir = os.environ.get('appdata')
if appdata_dir:
userConf = _readOptions(
os.path.join(appdata_dir, 'youtube-dl', 'config'),
default=None)
if userConf is None:
userConf = _readOptions(
os.path.join(appdata_dir, 'youtube-dl', 'config.txt'),
default=None)
if userConf is None:
userConf = _readOptions(
os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
default=None)
if userConf is None:
userConf = _readOptions(
os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
default=None)
if userConf is None:
userConf = []
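# Illustration (not part of the diff): user options are now read from the
# first of these that exists, falling back to an empty list:
#   1. $XDG_CONFIG_HOME/youtube-dl/config (else ~/.config/youtube-dl/config,
#      with .conf fallbacks)
#   2. %APPDATA%/youtube-dl/config
#   3. %APPDATA%/youtube-dl/config.txt
#   4. ~/youtube-dl.conf
#   5. ~/youtube-dl.conf.txt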
commandLineConf = sys.argv[1:]
argv = systemConf + userConf + commandLineConf
opts, args = parser.parse_args(argv)
@ -452,19 +475,6 @@ def _real_main(argv=None):
parser, opts, args = parseOpts(argv)
# Open appropriate CookieJar
if opts.cookiefile is None:
jar = compat_cookiejar.CookieJar()
else:
try:
jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile)
if os.access(opts.cookiefile, os.R_OK):
jar.load()
except (IOError, OSError) as err:
if opts.verbose:
traceback.print_exc()
write_string(u'ERROR: unable to open cookie file\n')
sys.exit(101)
# Set user agent
if opts.user_agent is not None:
std_headers['User-Agent'] = opts.user_agent
@ -496,8 +506,6 @@ def _real_main(argv=None):
all_urls = batchurls + args
all_urls = [url.strip() for url in all_urls]
opener = _setup_opener(jar=jar, opts=opts)
extractors = gen_extractors()
if opts.list_extractors:
@ -552,7 +560,7 @@ def _real_main(argv=None):
if opts.retries is not None:
try:
opts.retries = int(opts.retries)
except (TypeError, ValueError) as err:
except (TypeError, ValueError):
parser.error(u'invalid retry count specified')
if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
@ -563,13 +571,13 @@ def _real_main(argv=None):
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError) as err:
except (TypeError, ValueError):
parser.error(u'invalid playlist start number specified')
try:
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError) as err:
except (TypeError, ValueError):
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
@ -672,34 +680,14 @@ def _real_main(argv=None):
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
'download_archive': opts.download_archive,
'cookiefile': opts.cookiefile,
'nocheckcertificate': opts.no_check_certificate,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
}
with YoutubeDL(ydl_opts) as ydl:
if opts.verbose:
write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=os.path.dirname(os.path.abspath(__file__)))
out, err = sp.communicate()
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
write_string(u'[debug] Git HEAD: ' + out + u'\n')
except:
try:
sys.exc_clear()
except:
pass
write_string(u'[debug] Python version %s - %s' %
(platform.python_version(), platform_name()) + u'\n')
proxy_map = {}
for handler in opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
ydl.print_debug_header()
ydl.add_default_info_extractors()
# PostProcessors
@ -730,46 +718,9 @@ def _real_main(argv=None):
ydl.to_screen(u'--max-download limit reached, aborting.')
retcode = 101
# Dump cookie jar if requested
if opts.cookiefile is not None:
try:
jar.save()
except (IOError, OSError):
sys.exit(u'ERROR: unable to save cookie jar')
sys.exit(retcode)
def _setup_opener(jar=None, opts=None, timeout=300):
if opts is None:
FakeOptions = collections.namedtuple(
'FakeOptions', ['proxy', 'no_check_certificate'])
opts = FakeOptions(proxy=None, no_check_certificate=False)
cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
if opts.proxy is not None:
if opts.proxy == '':
proxies = {}
else:
proxies = {'http': opts.proxy, 'https': opts.proxy}
else:
proxies = compat_urllib_request.getproxies()
# Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
https_handler = make_HTTPS_handler(opts)
opener = compat_urllib_request.build_opener(
https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
# Delete the default user-agent header, which would otherwise apply in
# cases where our custom HTTP handler doesn't come into play
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = []
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(timeout)
return opener
def main(argv=None):
try:
_real_main(argv)

File: youtube_dl/extractor/__init__.py

@ -20,9 +20,11 @@ from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .clipsyndicate import ClipsyndicateIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
from .criterion import CriterionIE
from .cspan import CSpanIE
@ -54,7 +56,7 @@ from .flickr import FlickrIE
from .francetv import (
PluzzIE,
FranceTvInfoIE,
France2IE,
FranceTVIE,
GenerationQuoiIE
)
from .freesound import FreesoundIE
@ -70,6 +72,7 @@ from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
from .imdb import ImdbIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
@ -99,11 +102,13 @@ from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .podomatic import PodomaticIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
from .rbmaradio import RBMARadioIE
@ -117,6 +122,11 @@ from .rutube import RutubeIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
from .smotri import (
SmotriIE,
SmotriCommunityIE,
SmotriUserIE,
)
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
from .southparkstudios import (
@ -135,6 +145,7 @@ from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
@ -155,19 +166,31 @@ from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vimeo import (
VimeoIE,
VimeoChannelIE,
VimeoUserIE,
VimeoAlbumIE,
VimeoGroupsIE,
)
from .vine import VineIE
from .viki import VikiIE
from .vk import VKIE
from .wat import WatIE
from .websurg import WeBSurgIE
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .xtube import XTubeIE
from .yahoo import YahooIE, YahooSearchIE
from .yahoo import (
YahooIE,
YahooNewsIE,
YahooSearchIE,
)
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
@ -184,6 +207,7 @@ from .youtube import (
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
)
from .zdf import ZDFIE

File: youtube_dl/extractor/addanime.py

@ -13,7 +13,7 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
_VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
_VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
IE_NAME = u'AddAnime'
_TEST = {
u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',

File: youtube_dl/extractor/anitube.py

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
@ -28,9 +27,8 @@ class AnitubeIE(InfoExtractor):
key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
webpage, u'key')
webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
key)
config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8'))
video_title = config_xml.find('title').text

File: youtube_dl/extractor/appletrailers.py

@ -10,7 +10,7 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
_TEST = {
u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
u"playlist": [
@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor):
})
formats = sorted(formats, key=lambda f: (f['height'], f['width']))
info = {
playlist.append({
'_type': 'video',
'id': video_id,
'title': title,
@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor):
'upload_date': upload_date,
'uploader_id': uploader_id,
'user_agent': 'QuickTime compatible (youtube-dl)',
}
# TODO: Remove when #980 has been merged
info['url'] = formats[-1]['url']
info['ext'] = formats[-1]['ext']
playlist.append(info)
})
return {
'_type': 'playlist',

File: youtube_dl/extractor/archiveorg.py

@ -11,7 +11,7 @@ from ..utils import (
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
_VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
_VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
_TEST = {
u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
@ -49,7 +49,7 @@ class ArchiveOrgIE(InfoExtractor):
for f in formats:
f['ext'] = determine_ext(f['url'])
info = {
return {
'_type': 'video',
'id': video_id,
'title': title,
@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor):
'description': description,
'uploader': uploader,
'upload_date': upload_date,
'thumbnail': data.get('misc', {}).get('image'),
}
thumbnail = data.get('misc', {}).get('image')
if thumbnail:
info['thumbnail'] = thumbnail
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

File: youtube_dl/extractor/arte.py

@ -1,7 +1,6 @@
# encoding: utf-8
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -18,8 +17,8 @@ from ..utils import (
# add tests.
class ArteTvIE(InfoExtractor):
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
_LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
_LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
@ -78,8 +77,7 @@ class ArteTvIE(InfoExtractor):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
@ -109,9 +107,8 @@ class ArteTvIE(InfoExtractor):
"""Extract form http://liveweb.arte.tv/"""
webpage = self._download_webpage(url, name)
video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
video_id, u'Downloading information')
config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
event_doc = config_doc.find('event')
url_node = event_doc.find('video').find('urlHd')
if url_node is None:

File: youtube_dl/extractor/auengine.py

@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor):
u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
}
}
_VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
_VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

File: youtube_dl/extractor/bambuser.py

@ -25,6 +25,11 @@ class BambuserIE(InfoExtractor):
u'uploader': u'pixelversity',
u'uploader_id': u'344706',
},
u'params': {
# It doesn't respect the 'Range' header, it would download the whole video
# caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59
u'skip_download': True,
},
}
def _real_extract(self, url):
@ -49,7 +54,7 @@ class BambuserIE(InfoExtractor):
class BambuserChannelIE(InfoExtractor):
IE_NAME = u'bambuser:channel'
_VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
_VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
# The maximum number we can get with each request
_STEP = 50

File: youtube_dl/extractor/bandcamp.py

@ -34,7 +34,6 @@ class BandcampIE(InfoExtractor):
json_code = m_trackinfo.group(1)
data = json.loads(json_code)
entries = []
for d in data:
formats = [{
'format_id': 'format_id',

File: youtube_dl/extractor/bloomberg.py

@ -4,7 +4,7 @@ from .common import InfoExtractor
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html'
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = {
u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',

File: youtube_dl/extractor/brightcove.py

@ -76,18 +76,21 @@ class BrightcoveIE(InfoExtractor):
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
def find_param(name):
return find_xpath_attr(object_doc, './param', 'name', name)
node = find_xpath_attr(object_doc, './param', 'name', name)
if node is not None:
return node.attrib['value']
return None
playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
params['playerKey'] = playerKey.attrib['value']
params['playerKey'] = playerKey
# The three fields hold the id of the video
videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
if videoPlayer is not None:
params['@videoPlayer'] = videoPlayer.attrib['value']
params['@videoPlayer'] = videoPlayer
linkBase = find_param('linkBaseURL')
if linkBase is not None:
params['linkBaseURL'] = linkBase.attrib['value']
params['linkBaseURL'] = linkBase
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data

File: youtube_dl/extractor/canalplus.py

@ -1,6 +1,5 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
@ -31,11 +30,10 @@ class CanalplusIE(InfoExtractor):
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
info_page = self._download_webpage(info_url,video_id,
doc = self._download_xml(info_url,video_id,
u'Downloading video info')
self.report_extraction(video_id)
doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
video_info = [video for video in doc if video.find('ID').text == video_id][0]
infos = video_info.find('INFOS')
media = video_info.find('MEDIA')

File: youtube_dl/extractor/clipfish.py

@ -0,0 +1,58 @@
import re
import time
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import ExtractorError
class ClipfishIE(InfoExtractor):
IE_NAME = u'clipfish'
_VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
_TEST = {
u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
u'file': u'3966754.mp4',
u'md5': u'2521cd644e862936cf2e698206e47385',
u'info_dict': {
u'title': u'FIFA 14 - E3 2013 Trailer',
u'duration': 82,
},
u'skip': 'Blocked in the US'
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
(video_id, int(time.time())))
doc = self._download_xml(
info_url, video_id, note=u'Downloading info page')
title = doc.find('title').text
video_url = doc.find('filename').text
if video_url is None:
xml_bytes = xml.etree.ElementTree.tostring(doc)
raise ExtractorError(u'Cannot find video URL in document %r' %
xml_bytes)
thumbnail = doc.find('imageurl').text
duration_str = doc.find('duration').text
m = re.match(
r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
duration_str)
if m:
duration = (
(int(m.group('hours')) * 60 * 60) +
(int(m.group('minutes')) * 60) +
(int(m.group('seconds')))
)
else:
duration = None
return {
'id': video_id,
'title': title,
'url': video_url,
'thumbnail': thumbnail,
'duration': duration,
}
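A quick check of the duration parsing above; the sample string is an assumption, chosen to reproduce the 82-second duration in the test:

    import re

    m = re.match(
        r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
        '0:01:22:00')  # hypothetical clipfish duration string
    duration = (int(m.group('hours')) * 3600
                + int(m.group('minutes')) * 60
                + int(m.group('seconds')))
    assert duration == 82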

File: youtube_dl/extractor/clipsyndicate.py

@ -0,0 +1,52 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
)
class ClipsyndicateIE(InfoExtractor):
_VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
u'info_dict': {
u'id': u'4629301',
u'ext': u'mp4',
u'title': u'Brick Briscoe',
u'duration': 612,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
video_id, u'Downloading player')
# it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
playlist_page = self._download_webpage(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info')
# Fix broken xml
playlist_page = re.sub('&', '&amp;', playlist_page)
pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
track_doc = pdoc.find('trackList/track')
def find_param(name):
node = find_xpath_attr(track_doc, './/param', 'name', name)
if node is not None:
return node.attrib['value']
return {
'id': video_id,
'title': find_param('title'),
'url': track_doc.find('location').text,
'thumbnail': find_param('thumbnail'),
'duration': int(find_param('duration')),
}

File: youtube_dl/extractor/cnn.py

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
@ -33,8 +32,7 @@ class CNNIE(InfoExtractor):
path = mobj.group('path')
page_title = mobj.group('title')
info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
info_xml = self._download_webpage(info_url, page_title)
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
info = self._download_xml(info_url, page_title)
formats = []
for f in info.findall('files/file'):

File: youtube_dl/extractor/collegehumor.py

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
self.report_extraction(video_id)
xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
metaXml = self._download_webpage(xmlUrl, video_id,
mdoc = self._download_xml(xmlUrl, video_id,
u'Downloading info XML',
u'Unable to download video info XML')
mdoc = xml.etree.ElementTree.fromstring(metaXml)
try:
videoNode = mdoc.findall('./video')[0]
youtubeIdNode = videoNode.find('./youtubeID')
@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor):
if next_url.endswith(u'manifest.f4m'):
manifest_url = next_url + '?hdcore=2.10.3'
manifestXml = self._download_webpage(manifest_url, video_id,
adoc = self._download_xml(manifest_url, video_id,
u'Downloading XML manifest',
u'Unable to download video info XML')
adoc = xml.etree.ElementTree.fromstring(manifestXml)
try:
video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
except IndexError:

File: youtube_dl/extractor/comedycentral.py

@ -1,7 +1,7 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
@ -11,7 +11,31 @@ from ..utils import (
)
class ComedyCentralIE(InfoExtractor):
class ComedyCentralIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
_FEED_URL = u'http://comedycentral.com/feeds/mrss/'
_TEST = {
u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
u'md5': u'4167875aae411f903b751a21f357f1ee',
u'info_dict': {
u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
u'ext': u'mp4',
u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
webpage, u'mgid')
return self._get_videos_info(mgid)
class ComedyCentralShowsIE(InfoExtractor):
IE_DESC = u'The Daily Show / Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
@ -127,13 +151,12 @@ class ComedyCentralIE(InfoExtractor):
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
indexXml = self._download_webpage(indexUrl, epTitle,
idoc = self._download_xml(indexUrl, epTitle,
u'Downloading show index',
u'unable to download episode index')
results = []
idoc = xml.etree.ElementTree.fromstring(indexXml)
itemEls = idoc.findall('.//item')
for partNum,itemEl in enumerate(itemEls):
mediaId = itemEl.findall('./guid')[0].text
@ -144,10 +167,9 @@ class ComedyCentralIE(InfoExtractor):
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
configXml = self._download_webpage(configUrl, epTitle,
cdoc = self._download_xml(configUrl, epTitle,
u'Downloading configuration for %s' % shortMediaId)
cdoc = xml.etree.ElementTree.fromstring(configXml)
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
@ -169,7 +191,7 @@ class ComedyCentralIE(InfoExtractor):
})
effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
info = {
results.append({
'id': shortMediaId,
'formats': formats,
'uploader': showId,
@ -177,11 +199,6 @@ class ComedyCentralIE(InfoExtractor):
'title': effTitle,
'thumbnail': None,
'description': compat_str(officialTitle),
}
# TODO: Remove when #980 has been merged
info.update(info['formats'][-1])
results.append(info)
})
return results
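
The rewritten ComedyCentralIE above no longer walks the player config itself; it scrapes an mgid from the page and hands it to the shared MTV feed code. A minimal sketch of that lookup, using a hypothetical page fragment:

    import re

    # Hypothetical fragment shaped like the data-mgid attribute the regex targets
    webpage = '<div data-mgid="mgid:cms:video:comedycentral.com:123456"></div>'
    mgid = re.search(r'data-mgid="(?P<mgid>mgid:.*?)"', webpage).group('mgid')
    print(mgid)  # mgid:cms:video:comedycentral.com:123456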

View File

@ -4,11 +4,11 @@ import re
import socket
import sys
import netrc
import xml.etree.ElementTree
from ..utils import (
compat_http_client,
compat_urllib_error,
compat_urllib_request,
compat_str,
clean_html,
@ -19,6 +19,7 @@ from ..utils import (
unescapeHTML,
)
class InfoExtractor(object):
"""Information Extractor class.
@ -54,6 +55,9 @@ class InfoExtractor(object):
subtitles: The subtitle file contents as a dictionary in the format
{language: subtitles}.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
comment_count: Number of comments on the video
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib.request.urlopen
age_limit: Age restriction for the video, as an integer (years)
@ -75,6 +79,7 @@ class InfoExtractor(object):
* acodec Name of the audio codec in use
* vbr Average video bitrate in KBit/s
* vcodec Name of the video codec in use
* filesize The number of bytes, if known in advance
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
@ -156,7 +161,7 @@ class InfoExtractor(object):
elif note is not False:
self.to_screen(u'%s: %s' % (video_id, note))
try:
return compat_urllib_request.urlopen(url_or_request)
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
@ -208,6 +213,12 @@ class InfoExtractor(object):
""" Returns the data of the page as a string """
return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def _download_xml(self, url_or_request, video_id,
note=u'Downloading XML', errnote=u'Unable to download XML'):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@ -356,7 +367,8 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
r'''(?ix)<meta
(?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
html, display_name, fatal=False)
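
The new _download_xml helper above is the workhorse of this commit: it folds the download-then-parse two-step repeated across the extractors into a single call. A minimal standalone sketch of the same idea, assuming Python 3's urllib in place of youtube-dl's compat layer and downloader plumbing:

    import urllib.request
    import xml.etree.ElementTree

    def download_xml(url):
        # Fetch the document, then parse it. Re-encoding to UTF-8 mirrors the
        # helper above: ElementTree rejects str input that carries an explicit
        # encoding declaration, but accepts the equivalent bytes.
        xml_string = urllib.request.urlopen(url).read().decode('utf-8')
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))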

View File

@ -6,7 +6,7 @@ from ..utils import (
)
class CSpanIE(InfoExtractor):
_VALID_URL = r'http://www.c-spanvideo.org/program/(.*)'
_VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'
_TEST = {
u'url': u'http://www.c-spanvideo.org/program/HolderonV',
u'file': u'315139.flv',

View File

@ -11,6 +11,7 @@ from ..utils import (
get_element_by_attribute,
get_element_by_id,
orderedSet,
str_to_int,
ExtractorError,
)
@ -146,6 +147,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self._list_available_subtitles(video_id, webpage)
return
view_count = str_to_int(self._search_regex(
r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count'))
return {
'id': video_id,
'formats': formats,
@ -155,6 +159,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
'view_count': view_count,
}
def _get_available_subtitles(self, video_id, webpage):
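
The view-count hunk above feeds strings like u'1,836,420' through the newly imported str_to_int utility. A sketch of what that helper is assumed to do:

    import re

    def str_to_int(int_str):
        # Drop thousands separators (',' and '.') before converting
        int_str = re.sub(r'[,\.]', u'', int_str)
        return int(int_str)

    print(str_to_int(u'1,836,420'))  # 1836420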

View File

@ -1,6 +1,5 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -29,17 +28,16 @@ class DaumIE(InfoExtractor):
video_id = mobj.group(1)
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
full_id = self._search_regex(
r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
webpage, u'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info_xml = self._download_webpage(
info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
u'Downloading video info')
urls_xml = self._download_webpage(
urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
video_id, u'Downloading video formats info')
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
@ -49,10 +47,9 @@ class DaumIE(InfoExtractor):
'vid': full_id,
'profile': profile,
})
url_xml = self._download_webpage(
url_doc = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
video_id, note=False)
url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
@ -60,7 +57,7 @@ class DaumIE(InfoExtractor):
'format_id': profile,
})
info = {
return {
'id': video_id,
'title': info.find('TITLE').text,
'formats': formats,
@ -69,6 +66,3 @@ class DaumIE(InfoExtractor):
'duration': int(info.find('DURATION').text),
'upload_date': info.find('REGDTTM').text[:8],
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -1,7 +1,6 @@
# coding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -12,7 +11,7 @@ from ..utils import (
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
_VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TEST = {
u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
u'file': u'36983.webm',
@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details')
thumbnail_els = details_doc.findall('.//teaserimage')
thumbnails = [{
@ -67,7 +65,7 @@ class DreiSatIE(InfoExtractor):
return (qidx, prefer_http, format['video_bitrate'])
formats.sort(key=_sortkey)
info = {
return {
'_type': 'video',
'id': video_id,
'title': video_title,
@ -78,8 +76,3 @@ class DreiSatIE(InfoExtractor):
'uploader': video_uploader,
'upload_date': upload_date,
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
config_xml = self._download_webpage(
config = self._download_xml(
'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video_url = config.find('file').text
return {

View File

@ -10,7 +10,7 @@ from ..utils import (
class EightTracksIE(InfoExtractor):
IE_NAME = '8tracks'
_VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
_VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
_TEST = {
u"name": u"EightTracks",
u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a",

View File

@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor):
IE_NAME = u'exfm'
IE_DESC = u'ex.fm'
_VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
_SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream'
_SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
_TESTS = [
{
u'url': u'http://ex.fm/song/eh359',

View File

@ -1,6 +1,5 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -10,7 +9,7 @@ from ..utils import (
class FazIE(InfoExtractor):
IE_NAME = u'faz.net'
_VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html'
_VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
_TEST = {
u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
@ -28,9 +27,8 @@ class FazIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
u'config xml url')
config_xml = self._download_webpage(config_xml_url, video_id,
config = self._download_xml(config_xml_url, video_id,
u'Downloading config xml')
config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
encodings = config.find('ENCODINGS')
formats = []
@ -46,13 +44,10 @@ class FazIE(InfoExtractor):
})
descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
info = {
return {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': descr,
'thumbnail': config.find('STILL/STILL_BIG').text,
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -12,7 +12,7 @@ from ..utils import (
class FKTVIE(InfoExtractor):
IE_NAME = u'fernsehkritik.tv'
_VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
_VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
_TEST = {
u'url': u'http://fernsehkritik.tv/folge-1',
@ -52,7 +52,7 @@ class FKTVIE(InfoExtractor):
class FKTVPosteckeIE(InfoExtractor):
IE_NAME = u'fernsehkritik.tv:postecke'
_VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
_VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
_TEST = {
u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
u'file': u'0120.flv',

View File

@ -1,6 +1,5 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
import json
from .common import InfoExtractor
@ -11,11 +10,10 @@ from ..utils import (
class FranceTVBaseInfoExtractor(InfoExtractor):
def _extract_video(self, video_id):
xml_desc = self._download_webpage(
info = self._download_xml(
'http://www.francetvinfo.fr/appftv/webservices/video/'
'getInfosOeuvre.php?id-diffusion='
+ video_id, video_id, 'Downloading XML config')
info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8'))
manifest_url = info.find('videos/video/url').text
video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8')
@ -23,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
thumbnail_path = info.find('image').text
return {'id': video_id,
'ext': 'mp4',
'ext': 'flv' if video_url.startswith('rtmp') else 'mp4',
'url': video_url,
'title': info.find('titre').text,
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path),
@ -47,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):
class FranceTvInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = u'francetvinfo.fr'
_VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html'
_VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
_TEST = {
u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
@ -68,35 +66,101 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self._extract_video(video_id)
class France2IE(FranceTVBaseInfoExtractor):
IE_NAME = u'france2.fr'
_VALID_URL = r'''(?x)https?://www\.france2\.fr/
class FranceTVIE(FranceTVBaseInfoExtractor):
IE_NAME = u'francetv'
IE_DESC = u'France 2, 3, 4, 5 and Ô'
_VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
(?:
emissions/.*?/videos/(?P<id>\d+)
| emission/(?P<key>[^/?]+)
emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
| (emissions?|jt)/(?P<key>[^/?]+)
)'''
_TEST = {
u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
u'file': u'75540104.mp4',
u'info_dict': {
u'title': u'13h15, le samedi...',
u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
_TESTS = [
# france2
{
u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
u'file': u'75540104.mp4',
u'info_dict': {
u'title': u'13h15, le samedi...',
u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d',
},
u'params': {
# m3u8 download
u'skip_download': True,
},
},
u'params': {
u'skip_download': True,
# france3
{
u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
u'info_dict': {
u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
u'ext': u'flv',
u'title': u'Le scandale du prix des médicaments',
u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce',
},
u'params': {
# rtmp download
u'skip_download': True,
},
},
}
# france4
{
u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
u'info_dict': {
u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
u'ext': u'flv',
u'title': u'Hero Corp Making of - Extrait 1',
u'description': u'md5:c87d54871b1790679aec1197e73d650a',
},
u'params': {
# rtmp download
u'skip_download': True,
},
},
# france5
{
u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
u'info_dict': {
u'id': u'92837968',
u'ext': u'mp4',
u'title': u'C à dire ?!',
u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f',
},
u'params': {
# m3u8 download
u'skip_download': True,
},
},
# franceo
{
u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013',
u'info_dict': {
u'id': u'92327925',
u'ext': u'mp4',
u'title': u'Infô-Afrique',
u'description': u'md5:ebf346da789428841bee0fd2a935ea55',
},
u'params': {
# m3u8 download
u'skip_download': True,
},
u'skip': u'The id changes frequently',
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj.group('key'):
webpage = self._download_webpage(url, mobj.group('key'))
video_id = self._html_search_regex(
r'''(?x)<div\s+class="video-player">\s*
id_res = [
(r'''(?x)<div\s+class="video-player">\s*
<a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+
class="francetv-video-player">''',
webpage, u'video ID')
class="francetv-video-player">'''),
(r'<a id="player_direct" href="http://info\.francetelevisions'
'\.fr/\?id-video=([^"/&]+)'),
(r'<a class="video" id="ftv_player_(.+?)"'),
]
video_id = self._html_search_regex(id_res, webpage, u'video ID')
else:
video_id = mobj.group('id')
return self._extract_video(video_id)
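
_html_search_regex accepts the id_res list above and returns the capture of the first pattern that matches, so the three page layouts are probed in order. Assuming it behaves like the loop below, a self-contained illustration:

    import re

    def first_match(patterns, html):
        # Try each candidate regex in turn; the first one that matches wins
        for pattern in patterns:
            mobj = re.search(pattern, html)
            if mobj:
                return mobj.group(1)
        return None

    html = '<a class="video" id="ftv_player_92837968"'  # hypothetical fragment
    print(first_match([r'id-video=([^"/&]+)', r'id="ftv_player_(.+?)"'], html))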

View File

@ -4,7 +4,7 @@ from .common import InfoExtractor
class GamekingsIE(InfoExtractor):
_VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
_VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
_TEST = {
u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
u'file': u'20130811.mp4',

View File

@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor):
'format_id': q,
})
info = {
return {
'id': data_video['guid'],
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
'description': get_meta_content('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -1,13 +1,10 @@
import re
from .mtv import MTVIE, _media_xml_tag
from .mtv import MTVServicesInfoExtractor
class GametrailersIE(MTVIE):
"""
Gametrailers use the same videos system as MTVIE, it just changes the feed
url, where the uri is and the method to get the thumbnails.
"""
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
class GametrailersIE(MTVServicesInfoExtractor):
_VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
@ -17,15 +14,9 @@ class GametrailersIE(MTVIE):
u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}
# Overwrite MTVIE properties we don't want
_TESTS = []
_FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
def _get_thumbnail_url(self, uri, itemdoc):
search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
return itemdoc.find(search_path).attrib['url']
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')

View File

@ -169,8 +169,13 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'video title', default=u'video', flags=re.DOTALL)
video_title = self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, u'video title',
default=u'video')
# video uploader is domain name
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
@ -188,13 +193,35 @@ class GenericIE(InfoExtractor):
# Look for embedded YouTube player
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
for tuppl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for embedded Dailymotion player
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
for tuppl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for embedded Wistia player
match = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
return {
'_type': 'url_transparent',
'url': unescapeHTML(match.group('url')),
'ie_key': 'Wistia',
'uploader': video_uploader,
'title': video_title,
'id': video_id,
}
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@ -209,7 +236,7 @@ class GenericIE(InfoExtractor):
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@ -236,18 +263,11 @@ class GenericIE(InfoExtractor):
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
# here's a fun little line of code for you:
video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')
return [{
return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
'ext': video_extension,
}]
}
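
The embedded-player checks added above all follow one pattern: scan the raw HTML for an iframe whose src points at a known player, then delegate to that site's extractor. A small sketch of the Dailymotion case against a hypothetical page:

    import re

    webpage = '<iframe src="http://www.dailymotion.com/embed/video/x17abcd"></iframe>'
    matches = re.findall(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1',
        webpage)
    print([tuppl[1] for tuppl in matches])
    # ['http://www.dailymotion.com/embed/video/x17abcd']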

View File

@ -11,7 +11,7 @@ class HotNewHipHopIE(InfoExtractor):
u'file': u'1435540.mp3',
u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96',
u'info_dict': {
u"title": u"Freddie Gibbs - Lay It Down"
u"title": u'Freddie Gibbs "Lay It Down"'
}
}

View File

@ -103,7 +103,7 @@ class IGNIE(InfoExtractor):
class OneUPIE(IGNIE):
"""Extractor for 1up.com, it uses the ign videos system."""
_VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)'
_VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'

View File

@ -0,0 +1,57 @@
import re
import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
get_element_by_attribute,
)
class ImdbIE(InfoExtractor):
IE_NAME = u'imdb'
IE_DESC = u'Internet Movie Database trailers'
_VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)'
_TEST = {
u'url': u'http://www.imdb.com/video/imdb/vi2524815897',
u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068',
u'info_dict': {
u'id': u'2524815897',
u'ext': u'mp4',
u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
u'description': u'md5:9061c2219254e5d14e03c25c98e96a81',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url,video_id)
descr = get_element_by_attribute('itemprop', 'description', webpage)
available_formats = re.findall(
r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
flags=re.MULTILINE)
formats = []
for f_id, f_path in available_formats:
f_path = f_path.strip()
format_page = self._download_webpage(
compat_urlparse.urljoin(url, f_path),
u'Downloading info for %s format' % f_id)
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, u'json data', flags=re.DOTALL)
info = json.loads(json_data)
format_info = info['videoPlayerObject']['video']
formats.append({
'format_id': f_id,
'url': format_info['url'],
})
return {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': descr,
'thumbnail': format_info['slate'],
}
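
The new IMDb extractor downloads one page per format and digs the media URL out of a JSON blob embedded in a script tag. A sketch of that inner step, with a hypothetical page fragment:

    import json
    import re

    format_page = ('<script class="imdb-player-data" type="text/json">'
                   '{"videoPlayerObject": {"video": {"url": "http://www.example.com/trailer.mp4"}}}'
                   '</script>')
    json_data = re.search(
        r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
        format_page, flags=re.DOTALL).group(1)
    info = json.loads(json_data)
    print(info['videoPlayerObject']['video']['url'])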

View File

@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
_VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/'
_TEST = {
u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor):
video_id = query_dic['publishedid'][0]
url = self._build_url(query)
flashconfiguration_xml = self._download_webpage(url, video_id,
flashconfiguration = self._download_xml(url, video_id,
u'Downloading flash configuration')
flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor):
file_url = re.sub(r'(?<=\?)(.+)$',
lambda m: self._clean_query(m.group()),
file_url)
info_xml = self._download_webpage(file_url, video_id,
info = self._download_xml(file_url, video_id,
u'Downloading video info')
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
item = info.find('channel/item')
def _bp(p):

View File

@ -2,7 +2,6 @@
import json
import re
import xml.etree.ElementTree
from .common import InfoExtractor
@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor):
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, u'video ID')
xml_config = self._download_webpage(
config = self._download_xml(
xml_link, title, u'Downloading XML config')
config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8'))
info_json = self._search_regex(
r'(?sm)<format\.json>(.*?)</format\.json>',
xml_config, u'JSON information')
info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]
video_url = 'http://video720.jeuxvideo.com/' + info['file']

View File

@ -8,7 +8,7 @@ from ..utils import (
)
class JukeboxIE(InfoExtractor):
_VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html'
_VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
_IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
_VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
_TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'

View File

@ -1,7 +1,6 @@
import json
import os
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -94,10 +93,9 @@ class JustinTVIE(InfoExtractor):
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
chapter_info_xml = self._download_webpage(api, chapter_id,
doc = self._download_xml(api, chapter_id,
note=u'Downloading chapter information',
errnote=u'Chapter information download failed')
doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break

View File

@ -8,7 +8,7 @@ from ..utils import (
class LiveLeakIE(InfoExtractor):
_VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
_VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
IE_NAME = u'liveleak'
_TEST = {
u'url': u'http://www.liveleak.com/view?i=757_1364311680',

View File

@ -1,6 +1,5 @@
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -12,7 +11,7 @@ from ..utils import (
class LivestreamIE(InfoExtractor):
IE_NAME = u'livestream'
_VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
_VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
_TEST = {
u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
u'file': u'4719370.mp4',
@ -80,8 +79,7 @@ class LivestreamOriginalIE(InfoExtractor):
user = mobj.group('user')
api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
api_response = self._download_webpage(api_url, video_id)
info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8'))
info = self._download_xml(api_url, video_id)
item = info.find('channel').find('item')
ns = {'media': 'http://search.yahoo.com/mrss'}
thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']

View File

@ -69,6 +69,21 @@ class MetacafeIE(InfoExtractor):
u'age_limit': 18,
},
},
# cbs video
{
u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
u'info_dict': {
u'id': u'0rOxMBabDXN6',
u'ext': u'flv',
u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
u'duration': 129,
},
u'params': {
# rtmp download
u'skip_download': True,
},
},
]
@ -106,10 +121,16 @@ class MetacafeIE(InfoExtractor):
video_id = mobj.group(1)
# Check if video comes from YouTube
mobj2 = re.match(r'^yt-(.*)$', video_id)
if mobj2 is not None:
return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
# the video may come from an external site
m_external = re.match('^(\w{2})-(.*)$', video_id)
if m_external is not None:
prefix, ext_id = m_external.groups()
# Check if video comes from YouTube
if prefix == 'yt':
return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
# CBS videos use theplatform.com
if prefix == 'cb':
return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
# Retrieve video webpage to extract further information
req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
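
The generalized prefix check above turns the old YouTube special case into a small routing table: a two-letter prefix on the video id selects the external extractor. A sketch of the dispatch, returning None (a hypothetical stand-in for normal handling) for native Metacafe ids:

    import re

    def route_external(video_id):
        # Two-letter prefixes mark videos hosted elsewhere
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            if prefix == 'yt':
                return 'http://www.youtube.com/watch?v=%s' % ext_id
            if prefix == 'cb':
                return 'theplatform:%s' % ext_id
        return None  # native Metacafe video

    print(route_external('cb-0rOxMBabDXN6'))  # theplatform:0rOxMBabDXN6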

View File

@ -43,13 +43,10 @@ class MetacriticIE(InfoExtractor):
description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
webpage, u'description', flags=re.DOTALL)
info = {
return {
'id': video_id,
'title': clip.find('title').text,
'formats': formats,
'description': description,
'duration': int(clip.find('duration').text),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor):
'title': info['name'],
'url': final_song_url,
'ext': 'mp3',
'description': info['description'],
'description': info.get('description'),
'thumbnail': info['pictures'].get('extra_large'),
'uploader': info['user']['name'],
'uploader_id': info['user']['username'],

View File

@ -10,35 +10,8 @@ from ..utils import (
def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
class MTVIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
_FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
_TESTS = [
{
u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
u'file': u'853555.mp4',
u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
u'info_dict': {
u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
},
},
{
u'add_ie': ['Vevo'],
u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
u'file': u'USCJY1331283.mp4',
u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
u'info_dict': {
u'title': u'Everything Has Changed',
u'upload_date': u'20130606',
u'uploader': u'Taylor Swift',
},
u'skip': u'VEVO is only available in some countries',
},
]
class MTVServicesInfoExtractor(InfoExtractor):
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
@ -53,7 +26,12 @@ class MTVIE(InfoExtractor):
return base + m.group('finalid')
def _get_thumbnail_url(self, uri, itemdoc):
return 'http://mtv.mtvnimages.com/uri/' + uri
search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
thumb_node = itemdoc.find(search_path)
if thumb_node is None:
return None
else:
return thumb_node.attrib['url']
def _extract_video_formats(self, metadataXml):
if '/error_country_block.swf' in metadataXml:
@ -93,7 +71,7 @@ class MTVIE(InfoExtractor):
else:
description = None
info = {
return {
'title': itemdoc.find('title').text,
'formats': self._extract_video_formats(mediagen_page),
'id': video_id,
@ -101,19 +79,46 @@ class MTVIE(InfoExtractor):
'description': description,
}
# TODO: Remove when #980 has been merged
info.update(info['formats'][-1])
return info
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
u'Downloading info')
idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
return [self._get_video_info(item) for item in idoc.findall('.//item')]
class MTVIE(MTVServicesInfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
_FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
_TESTS = [
{
u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
u'file': u'853555.mp4',
u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
u'info_dict': {
u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
},
},
{
u'add_ie': ['Vevo'],
u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
u'file': u'USCJY1331283.mp4',
u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
u'info_dict': {
u'title': u'Everything Has Changed',
u'upload_date': u'20130606',
u'uploader': u'Taylor Swift',
},
u'skip': u'VEVO is only available in some countries',
},
]
def _get_thumbnail_url(self, uri, itemdoc):
return 'http://mtv.mtvnimages.com/uri/' + uri
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
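
The thumbnail lookup moved into the shared base class walks the MRSS document with namespace-qualified tags built by _media_xml_tag (defined at the top of this file). ElementTree wants Clark notation for namespaced finds, which is all the helper produces:

    def _media_xml_tag(tag):
        # Qualify a tag with the MRSS namespace in ElementTree's {uri}tag form
        return '{http://search.yahoo.com/mrss/}%s' % tag

    search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
    print(search_path)
    # {http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}thumbnail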

View File

@ -9,7 +9,7 @@ from ..utils import (
class MuzuTVIE(InfoExtractor):
_VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)'
_VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
IE_NAME = u'muzu.tv'
_TEST = {

View File

@ -1,5 +1,4 @@
import os.path
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -10,7 +9,7 @@ from ..utils import (
class MySpassIE(InfoExtractor):
_VALID_URL = r'http://www.myspass.de/.*'
_VALID_URL = r'http://www\.myspass\.de/.*'
_TEST = {
u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
u'file': u'11741.mp4',
@ -33,8 +32,7 @@ class MySpassIE(InfoExtractor):
# get metadata
metadata_url = META_DATA_URL_TEMPLATE % video_id
metadata_text = self._download_webpage(metadata_url, video_id)
metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
metadata = self._download_xml(metadata_url, video_id)
# extract values from metadata
url_flv_el = metadata.find('url_flv')

View File

@ -1,6 +1,5 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -38,14 +37,12 @@ class NaverIE(InfoExtractor):
'protocol': 'p2p',
'inKey': key,
})
info_xml = self._download_webpage(
info = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
video_id, u'Downloading video info')
urls_xml = self._download_webpage(
urls = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
video_id, u'Downloading video formats info')
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
@ -59,7 +56,7 @@ class NaverIE(InfoExtractor):
'height': int(format_el.find('height').text),
})
info = {
return {
'id': video_id,
'title': info.find('Subject').text,
'formats': formats,
@ -68,6 +65,3 @@ class NaverIE(InfoExtractor):
'upload_date': info.find('WriteDate').text.replace('.', ''),
'view_count': int(info.find('PlayCount').text),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
@ -21,8 +20,8 @@ class NBCNewsIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
info = all_info.find('video')
return {'id': video_id,
'title': info.find('headline').text,

View File

@ -1,6 +1,5 @@
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -26,9 +25,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
'path': initial_video_url.replace('.mp4', '_sd.mp4'),
})
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
path_response = self._download_webpage(path_url, video_id,
path_doc = self._download_xml(path_url, video_id,
u'Downloading final video url')
path_doc = xml.etree.ElementTree.fromstring(path_response)
video_url = path_doc.find('path').text
join = compat_urlparse.urljoin

View File

@ -2,7 +2,6 @@
import re
import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -77,11 +76,11 @@ class NiconicoIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
# Get video webpage
video_webpage = self._download_webpage(
'http://www.nicovideo.jp/watch/' + video_id, video_id)
# Get video webpage. We are not actually interested in it, but need
# the cookies in order to be able to download the info webpage
self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
video_info_webpage = self._download_webpage(
video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note=u'Downloading video info page')
@ -92,7 +91,6 @@ class NiconicoIE(InfoExtractor):
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
video_info = xml.etree.ElementTree.fromstring(video_info_webpage)
video_title = video_info.find('.//title').text
video_extension = video_info.find('.//movie_type').text
video_format = video_extension.upper()
@ -107,13 +105,11 @@ class NiconicoIE(InfoExtractor):
video_uploader = video_uploader_id
url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
try:
user_info_webpage = self._download_webpage(
user_info = self._download_xml(
url, video_id, note=u'Downloading user information')
video_uploader = user_info.find('.//nickname').text
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
else:
user_info = xml.etree.ElementTree.fromstring(user_info_webpage)
video_uploader = user_info.find('.//nickname').text
return {
'id': video_id,

View File

@ -0,0 +1,43 @@
import json
import re
from .common import InfoExtractor
class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
_VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'
_TEST = {
u"url": u"http://9gag.tv/v/1912",
u"file": u"1912.mp4",
u"info_dict": {
u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome"
},
u'add_ie': [u'Youtube']
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
data_json = self._html_search_regex(r'''(?x)
<div\s*id="tv-video"\s*data-video-source="youtube"\s*
data-video-meta="([^"]+)"''', webpage, u'video metadata')
data = json.loads(data_json)
return {
'_type': 'url_transparent',
'url': data['youtubeVideoId'],
'ie_key': 'Youtube',
'id': video_id,
'title': data['title'],
'description': data['description'],
'view_count': int(data['view_count']),
'like_count': int(data['statistic']['like']),
'dislike_count': int(data['statistic']['dislike']),
'thumbnail': data['thumbnail_url'],
}

View File

@ -12,7 +12,7 @@ from ..utils import (
)
class ORFIE(InfoExtractor):
_VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
_VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

View File

@ -5,7 +5,7 @@ from .common import InfoExtractor
class PBSIE(InfoExtractor):
_VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?'
_VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?'
_TEST = {
u'url': u'http://video.pbs.org/video/2365006249/',

View File

@ -0,0 +1,49 @@
import json
import re
from .common import InfoExtractor
class PodomaticIE(InfoExtractor):
IE_NAME = 'podomatic'
_VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
_TEST = {
u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
u"file": u"2009-01-02T16_03_35-08_00.mp3",
u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
u"info_dict": {
u"uploader": u"Science Teaching Tips",
u"uploader_id": u"scienceteachingtips",
u"title": u"64. When the Moon Hits Your Eye",
u"duration": 446,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
channel = mobj.group('channel')
json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +
'?permalink=true&rtmp=0') %
(mobj.group('proto'), channel, video_id))
data_json = self._download_webpage(
json_url, video_id, note=u'Downloading video info')
data = json.loads(data_json)
video_url = data['downloadLink']
uploader = data['podcast']
title = data['title']
thumbnail = data['imageLocation']
duration = int(data['length'] / 1000.0)
return {
'id': video_id,
'url': video_url,
'title': title,
'uploader': uploader,
'uploader_id': channel,
'thumbnail': thumbnail,
'duration': duration,
}

View File

@ -30,7 +30,7 @@ class RedTubeIE(InfoExtractor):
r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
video_title = self._html_search_regex(
r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
webpage, u'title')
# No self-labeling, but they describe themselves as

View File

@ -11,7 +11,7 @@ from ..utils import (
class RutubeIE(InfoExtractor):
_VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
_VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)'
_TEST = {
u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',

View File

@ -1,7 +1,6 @@
# coding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -35,12 +34,11 @@ class SinaIE(InfoExtractor):
def _extract_video(self, video_id):
data = compat_urllib_parse.urlencode({'vid': video_id})
url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
video_id, u'Downloading video url')
image_page = self._download_webpage(
'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
video_id, u'Downloading thumbnail info')
url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
return {'id': video_id,
'url': url_doc.find('./durl/url').text,

View File

@ -4,7 +4,7 @@ from .common import InfoExtractor
class SlashdotIE(InfoExtractor):
_VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'
_VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
_TEST = {
u'add_ie': ['Ooyala'],

View File

@ -0,0 +1,251 @@
# encoding: utf-8
import re
import json
import hashlib
from .common import InfoExtractor
from ..utils import (
ExtractorError
)
class SmotriIE(InfoExtractor):
IE_DESC = u'Smotri.com'
IE_NAME = u'smotri'
_VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
_TESTS = [
# real video id 2610366
{
u'url': u'http://smotri.com/video/view/?id=v261036632ab',
u'file': u'v261036632ab.mp4',
u'md5': u'2a7b08249e6f5636557579c368040eb9',
u'info_dict': {
u'title': u'катастрофа с камер видеонаблюдения',
u'uploader': u'rbc2008',
u'uploader_id': u'rbc08',
u'upload_date': u'20131118',
u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
},
},
# real video id 57591
{
u'url': u'http://smotri.com/video/view/?id=v57591cb20',
u'file': u'v57591cb20.flv',
u'md5': u'830266dfc21f077eac5afd1883091bcd',
u'info_dict': {
u'title': u'test',
u'uploader': u'Support Photofile@photofile',
u'uploader_id': u'support-photofile',
u'upload_date': u'20070704',
u'description': u'test, видео test',
u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
},
},
# video-password
{
u'url': u'http://smotri.com/video/view/?id=v1390466a13c',
u'file': u'v1390466a13c.mp4',
u'md5': u'f6331cef33cad65a0815ee482a54440b',
u'info_dict': {
u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
u'uploader': u'timoxa40',
u'uploader_id': u'timoxa40',
u'upload_date': u'20100404',
u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
},
u'params': {
u'videopassword': u'qwerty',
},
},
# age limit + video-password
{
u'url': u'http://smotri.com/video/view/?id=v15408898bcf',
u'file': u'v15408898bcf.flv',
u'md5': u'91e909c9f0521adf5ee86fbe073aad70',
u'info_dict': {
u'title': u'этот ролик не покажут по ТВ',
u'uploader': u'zzxxx',
u'uploader_id': u'ueggb',
u'upload_date': u'20101001',
u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
u'age_limit': 18,
u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
},
u'params': {
u'videopassword': u'333'
}
}
]
_SUCCESS = 0
_PASSWORD_NOT_VERIFIED = 1
_PASSWORD_DETECTED = 2
_VIDEO_NOT_FOUND = 3
def _search_meta(self, name, html, display_name=None):
if display_name is None:
display_name = name
return self._html_search_regex(
r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name),
html, display_name, fatal=False)
return self._html_search_meta(name, html, display_name)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
real_video_id = mobj.group('realvideoid')
# Download video JSON data
video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON')
video_json = json.loads(video_json_page)
status = video_json['status']
if status == self._VIDEO_NOT_FOUND:
raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with
# video-password set
video_password = self._downloader.params.get('videopassword', None)
if not video_password:
raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True)
video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)')
video_json = json.loads(video_json_page)
status = video_json['status']
if status == self._PASSWORD_NOT_VERIFIED:
raise ExtractorError(u'Video password is invalid', expected=True)
if status != self._SUCCESS:
raise ExtractorError(u'Unexpected status value %s' % status)
# Extract the URL of the video
video_url = video_json['file_data']
# Video JSON does not provide enough meta data
# We will extract some from the video web page instead
video_page_url = 'http://' + mobj.group('url')
video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
# Adult content
if re.search(u'EroConfirmText">', video_page) is not None:
self.report_age_confirmation()
confirm_string = self._html_search_regex(
r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
video_page, u'confirm string')
confirm_url = video_page_url + '&confirm=%s' % confirm_string
video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)')
adult_content = True
else:
adult_content = False
# Extract the rest of meta data
video_title = self._search_meta(u'name', video_page, u'title')
if not video_title:
video_title = video_url.rsplit('/', 1)[-1]
video_description = self._search_meta(u'description', video_page)
END_TEXT = u' на сайте Smotri.com'
if video_description.endswith(END_TEXT):
video_description = video_description[:-len(END_TEXT)]
START_TEXT = u'Смотреть онлайн ролик '
if video_description.startswith(START_TEXT):
video_description = video_description[len(START_TEXT):]
video_thumbnail = self._search_meta(u'thumbnail', video_page)
upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
video_upload_date = (
(
upload_date_m.group('year') +
upload_date_m.group('month') +
upload_date_m.group('day')
)
if upload_date_m else None
)
duration_str = self._search_meta(u'duration', video_page)
duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
video_duration = (
(
(int(duration_m.group('hours')) * 60 * 60) +
(int(duration_m.group('minutes')) * 60) +
int(duration_m.group('seconds'))
)
if duration_m else None
)
video_uploader = self._html_search_regex(
u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)
video_uploader_id = self._html_search_regex(
u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)
video_view_count = self._html_search_regex(
u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
return {
'id': video_id,
'url': video_url,
'title': video_title,
'thumbnail': video_thumbnail,
'description': video_description,
'uploader': video_uploader,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'video_duration': video_duration,
'view_count': video_view_count,
'age_limit': 18 if adult_content else 0,
'video_page_url': video_page_url
}
class SmotriCommunityIE(InfoExtractor):
IE_DESC = u'Smotri.com community videos'
IE_NAME = u'smotri:community'
_VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
community_id = mobj.group('communityid')
url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
rss = self._download_xml(url, community_id, u'Downloading community RSS')
entries = [self.url_result(video_url.text, 'Smotri')
for video_url in rss.findall('./channel/item/link')]
description_text = rss.find('./channel/description').text
community_title = self._html_search_regex(
u'^Видео сообщества "([^"]+)"$', description_text, u'community title')
return self.playlist_result(entries, community_id, community_title)
class SmotriUserIE(InfoExtractor):
IE_DESC = u'Smotri.com user videos'
IE_NAME = u'smotri:user'
_VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user_id = mobj.group('userid')
url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
rss = self._download_xml(url, user_id, u'Downloading user RSS')
entries = [self.url_result(video_url.text, 'Smotri')
for video_url in rss.findall('./channel/item/link')]
description_text = rss.find('./channel/description').text
user_nickname = self._html_search_regex(
u'^Видео режиссера (.*)$', description_text,
u'user nickname')
return self.playlist_result(entries, user_id, user_nickname)
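
When the vt.php endpoint reports _PASSWORD_DETECTED, the extractor retries the same URL with an md5pass parameter: the hex MD5 digest of the user-supplied password. A sketch with a hypothetical password and video id:

    import hashlib

    video_password = u'qwerty'  # hypothetical --video-password value
    video_json_url = 'http://smotri.com/vt.php?id=1390466'  # hypothetical id
    video_json_url += '&md5pass=%s' % hashlib.md5(
        video_password.encode('utf-8')).hexdigest()
    print(video_json_url)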

View File

@ -25,7 +25,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''^(?:https?://)?
(?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
|(?P<widget>w.soundcloud.com/player/?.*?url=.*)
|(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
)
'''
IE_NAME = u'soundcloud'
@ -76,44 +76,78 @@ class SoundcloudIE(InfoExtractor):
def _extract_info_dict(self, info, full_title=None, quiet=False):
track_id = compat_str(info['id'])
name = full_title or track_id
if quiet == False:
if quiet:
self.report_extraction(name)
thumbnail = info['artwork_url']
if thumbnail is not None:
thumbnail = thumbnail.replace('-large', '-t500x500')
ext = info.get('original_format', u'mp3')
result = {
'id': track_id,
'id': track_id,
'uploader': info['user']['username'],
'upload_date': unified_strdate(info['created_at']),
'title': info['title'],
'ext': info.get('original_format', u'mp3'),
'title': info['title'],
'description': info['description'],
'thumbnail': thumbnail,
}
if info.get('downloadable', False):
# We can build a direct link to the song
result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID)
format_url = (
u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
track_id, self._CLIENT_ID))
result['formats'] = [{
'format_id': 'download',
'ext': ext,
'url': format_url,
'vcodec': 'none',
}]
else:
# We have to retrieve the url
stream_json = self._download_webpage(
'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
track_id, u'Downloading track url')
# There should be only one entry in the dictionary
key, stream_url = list(json.loads(stream_json).items())[0]
if key.startswith(u'http'):
result['url'] = stream_url
elif key.startswith(u'rtmp'):
# The url doesn't have an rtmp app; we have to extract the playpath
url, path = stream_url.split('mp3:', 1)
result.update({
'url': url,
'play_path': 'mp3:' + path,
})
else:
formats = []
format_dict = json.loads(stream_json)
for key, stream_url in format_dict.items():
if key.startswith(u'http'):
formats.append({
'format_id': key,
'ext': ext,
'url': stream_url,
'vcodec': 'none',
})
elif key.startswith(u'rtmp'):
# The url doesn't have an rtmp app; we have to extract the playpath
url, path = stream_url.split('mp3:', 1)
formats.append({
'format_id': key,
'url': url,
'play_path': 'mp3:' + path,
'ext': ext,
'vcodec': 'none',
})
if not formats:
# We fall back to the stream_url in the original info; this cannot
# always be used since it sometimes gives an HTTP 404 error
result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID,
formats.append({
'format_id': u'fallback',
'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
'ext': ext,
'vcodec': 'none',
})
def format_pref(f):
if f['format_id'].startswith('http'):
return 2
if f['format_id'].startswith('rtmp'):
return 1
return 0
formats.sort(key=format_pref)
result['formats'] = formats
return result
@ -183,7 +217,7 @@ class SoundcloudSetIE(SoundcloudIE):
class SoundcloudUserIE(SoundcloudIE):
_VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
_VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
IE_NAME = u'soundcloud:user'
# it's in tests/test_playlists.py
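
The core of the SoundCloud rework above is the format_pref sort: plain HTTP streams beat RTMP ones, and the fallback stream_url ranks last. youtube-dl expects the best format at the end of the list, which is exactly where the sort puts the HTTP entries. A self-contained run with hypothetical format ids:

    formats = [  # hypothetical ids, in the order the streams API returned them
        {'format_id': 'fallback'},
        {'format_id': 'http_mp3_128_url'},
        {'format_id': 'rtmp_mp3_128_url'},
    ]

    def format_pref(f):
        if f['format_id'].startswith('http'):
            return 2
        if f['format_id'].startswith('rtmp'):
            return 1
        return 0

    formats.sort(key=format_pref)
    print([f['format_id'] for f in formats])
    # ['fallback', 'rtmp_mp3_128_url', 'http_mp3_128_url'] (best last)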

View File

@ -1,15 +1,14 @@
import re
from .mtv import MTVIE, _media_xml_tag
from .mtv import MTVServicesInfoExtractor
class SouthParkStudiosIE(MTVIE):
class SouthParkStudiosIE(MTVServicesInfoExtractor):
IE_NAME = u'southparkstudios.com'
_VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
# Overwrite MTVIE properties we don't want
_TESTS = [{
u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
@ -19,14 +18,6 @@ class SouthParkStudiosIE(MTVIE):
},
}]
def _get_thumbnail_url(self, uri, itemdoc):
search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
thumb_node = itemdoc.find(search_path)
if thumb_node is None:
return None
else:
return thumb_node.attrib['url']
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
url = u'http://www.' + mobj.group(u'url')

View File

@ -6,7 +6,7 @@ from ..utils import RegexNotFoundError, ExtractorError
class SpaceIE(InfoExtractor):
_VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html'
_VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
_TEST = {
u'add_ie': ['Brightcove'],
u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
@ -33,12 +32,10 @@ class SpiegelIE(InfoExtractor):
r'<div class="module-title">(.*?)</div>', webpage, u'title')
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
xml_code = self._download_webpage(
idoc = self._download_xml(
xml_url, video_id,
note=u'Downloading XML', errnote=u'Failed to download XML')
idoc = xml.etree.ElementTree.fromstring(xml_code)
formats = [
{
'format_id': n.tag.rpartition('type')[2],

View File

@ -18,7 +18,7 @@ from ..utils import (
class StanfordOpenClassroomIE(InfoExtractor):
IE_NAME = u'stanfordoc'
IE_DESC = u'Stanford Open ClassRoom'
_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
_VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
_TEST = {
u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
u'file': u'PracticalUnix_intro-environment.mp4',

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -32,8 +31,7 @@ class TeamcocoIE(InfoExtractor):
self.report_extraction(video_id)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage')
data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8'))
data = self._download_xml(data_url, video_id, 'Downloading data webpage')
qualities = ['500k', '480p', '1000k', '720p', '1080p']

View File

@ -7,7 +7,7 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
_VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
_VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
_TEST = {
u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
u'file': u'10635995.mp4',

View File

@ -0,0 +1,68 @@
import re
import json
from .common import InfoExtractor
from ..utils import (
xpath_with_ns,
)
_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)'
_TEST = {
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
u'info_dict': {
u'id': u'e9I_cZgTgIPd',
u'ext': u'flv',
u'title': u'Blackberry\'s big, bold Z30',
u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
u'duration': 247,
},
u'params': {
# rtmp download
u'skip_download': True,
},
}
def _get_info(self, video_id):
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
meta = self._download_xml(smil_url, video_id)
info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
base_url = head.find(_x('smil:meta')).attrib['base']
switch = body.find(_x('smil:switch'))
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
formats.append({
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
'width': int(attr['width']),
'height': int(attr['height']),
'vbr': int(attr['system-bitrate']),
})
formats.sort(key=lambda f: (f['height'], f['width'], f['vbr']))
return {
'id': video_id,
'title': info['title'],
'formats': formats,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
'duration': info['duration']//1000,
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
return self._get_info(video_id)
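
SMIL is a namespaced XML format, so bare tag names never match in ElementTree; xpath_with_ns expands each `prefix:tag` component into the `{uri}tag` form ElementTree expects. A self-contained sketch with a made-up SMIL snippet (the helper shown mirrors the one in youtube_dl/utils.py):

import xml.etree.ElementTree as ET

def xpath_with_ns(path, ns_map):
    # Rewrite each 'prefix:tag' step into ElementTree's '{uri}tag' form.
    replaced = []
    for c in path.split('/'):
        if ':' not in c:
            replaced.append(c)
        else:
            ns, tag = c.split(':')
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

_x = lambda p: xpath_with_ns(
    p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})

doc = ET.fromstring(
    '<smil xmlns="http://www.w3.org/2005/SMIL21/Language"><body><switch>'
    '<video src="a.mp4" width="640" height="360"/>'
    '</switch></body></smil>')
for v in doc.findall(_x('smil:body/smil:switch/smil:video')):
    print(v.attrib['src'])  # a.mp4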

View File

@ -1,6 +1,5 @@
# coding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -40,11 +39,9 @@ class TouTvIE(InfoExtractor):
r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
streams_webpage = self._download_webpage(
streams_doc = self._download_xml(
streams_url, video_id, note=u'Downloading stream list')
streams_doc = xml.etree.ElementTree.fromstring(
streams_webpage.encode('utf-8'))
video_url = next(n.text
for n in streams_doc.findall('.//choice/url')
if u'//ad.doubleclick' not in n.text)
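
The next() over a generator expression above returns the first stream URL that is not a DoubleClick ad and stops scanning there; without a default argument it raises StopIteration if every candidate is an ad. In isolation, with made-up URLs:

urls = [
    'http://ad.doubleclick.net/pfadx/some-ad',       # hypothetical
    'rtmp://streaming.example.net/toutv/real-clip',  # hypothetical
]
video_url = next(u for u in urls if '//ad.doubleclick' not in u)
print(video_url)  # rtmp://streaming.example.net/toutv/real-clip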

View File

@ -1,6 +1,5 @@
import json
import re
import xml.etree.ElementTree
from .common import InfoExtractor
@ -36,12 +35,10 @@ class TriluliluIE(InfoExtractor):
format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
u'video-formats2' % log)
format_str = self._download_webpage(
format_doc = self._download_xml(
format_url, video_id,
note=u'Downloading formats',
errnote=u'Error while downloading formats')
format_doc = xml.etree.ElementTree.fromstring(format_str)
video_url_template = (
u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
@ -58,7 +55,7 @@ class TriluliluIE(InfoExtractor):
for fnode in format_doc.findall('./formats/format')
]
info = {
return {
'_type': 'video',
'id': video_id,
'formats': formats,
@ -67,7 +64,3 @@ class TriluliluIE(InfoExtractor):
'thumbnail': thumbnail,
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
class UnistraIE(InfoExtractor):
_VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)'
_VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)'
_TEST = {
u'url': u'http://utv.unistra.fr/video.php?id_video=154',

View File

@ -9,7 +9,7 @@ from ..utils import (
)
class VeeHDIE(InfoExtractor):
_VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)'
_VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
_TEST = {
u'url': u'http://veehd.com/video/4686958',

View File

@ -15,7 +15,7 @@ class VevoIE(InfoExtractor):
Accepts urls from vevo.com or in the format 'vevo:{id}'
(currently used by MTVIE)
"""
_VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
_VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)'
_TESTS = [{
u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
u'file': u'GB1101300280.mp4',
@ -24,7 +24,7 @@ class VevoIE(InfoExtractor):
u"upload_date": u"20130624",
u"uploader": u"Hurts",
u"title": u"Somebody to Die For",
u"duration": 230,
u"duration": 230.12,
u"width": 1920,
u"height": 1080,
}

View File

@ -6,7 +6,7 @@ from ..utils import ExtractorError
class ViceIE(InfoExtractor):
_VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)'
_VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
_TEST = {
u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',

View File

@ -2,13 +2,10 @@ import json
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
)
class ViddlerIE(InfoExtractor):
_VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
_VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
_TEST = {
u"url": u"http://www.viddler.com/v/43903784",
u'file': u'43903784.mp4',
@ -47,7 +44,7 @@ class ViddlerIE(InfoExtractor):
r"thumbnail\s*:\s*'([^']*)'",
webpage, u'thumbnail', fatal=False)
info = {
return {
'_type': 'video',
'id': video_id,
'title': title,
@ -56,9 +53,3 @@ class ViddlerIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
# TODO: Remove when #980 has been merged
info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
info.update(info['formats'][-1])
return info

View File

@ -1,5 +1,4 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@ -8,7 +7,7 @@ from ..utils import (
)
class VideofyMeIE(InfoExtractor):
_VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)'
_VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
IE_NAME = u'videofy.me'
_TEST = {
@ -27,9 +26,8 @@ class VideofyMeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id,
config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
video_id)
config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video = config.find('video')
sources = video.find('sources')
url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
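
find_xpath_attr (imported from ..utils above) works around limited XPath attribute support in older ElementTree versions: it returns the first element under a path whose given attribute equals the given value, or None. A minimal equivalent sketch with a made-up sources node:

import xml.etree.ElementTree as ET

def find_xpath_attr(node, xpath, key, val):
    # First element matching `xpath` whose attribute `key` equals `val`,
    # or None -- mirrors the helper in youtube_dl/utils.py.
    for f in node.findall(xpath):
        if f.attrib.get(key) == val:
            return f
    return None

sources = ET.fromstring(
    '<sources><source id="HQ"/><source id="HQ blob"/></sources>')
print(find_xpath_attr(sources, 'source', 'id', 'HQ').attrib)  # {'id': 'HQ'}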

View File

@ -5,7 +5,7 @@ from .common import InfoExtractor
class VideoPremiumIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
_VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
_TEST = {
u'url': u'http://videopremium.tv/4w7oadjsf156',
u'file': u'4w7oadjsf156.f4v',
@ -41,4 +41,4 @@ class VideoPremiumIE(InfoExtractor):
'player_url': "http://videopremium.tv/uplayer/uppod.swf",
'ext': 'f4v',
'title': video_title,
}
}

View File

@ -0,0 +1,101 @@
import re
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
)
from .subtitles import SubtitlesInfoExtractor
class VikiIE(SubtitlesInfoExtractor):
IE_NAME = u'viki'
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
u'file': u'1023585v.mp4',
u'md5': u'a21454021c2646f5433514177e2caa5f',
u'info_dict': {
u'title': u'Heirs Episode 14',
u'uploader': u'SBS',
u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
u'upload_date': u'20131121',
u'age_limit': 13,
},
u'skip': u'Blocked in the US',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
uploader_m = re.search(
r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
if uploader_m is None:
uploader = None
else:
uploader = uploader_m.group(1).strip()
rating_str = self._html_search_regex(
r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
u'rating information', default='').strip()
RATINGS = {
'G': 0,
'PG': 10,
'PG-13': 13,
'R': 16,
'NC': 18,
}
age_limit = RATINGS.get(rating_str)
info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
info_webpage = self._download_webpage(
info_url, video_id, note=u'Downloading info page')
if re.match(r'\s*<div\s+class="video-error', info_webpage):
raise ExtractorError(
u'Video %s is blocked from your location.' % video_id,
expected=True)
video_url = self._html_search_regex(
r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
upload_date_str = self._html_search_regex(
r'"created_at":"([^"]+)"', info_webpage, u'upload date')
upload_date = (
unified_strdate(upload_date_str)
if upload_date_str is not None
else None
)
# subtitles
video_subtitles = self.extract_subtitles(video_id, info_webpage)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, info_webpage)
return
return {
'id': video_id,
'title': title,
'url': video_url,
'description': description,
'thumbnail': thumbnail,
'age_limit': age_limit,
'uploader': uploader,
'subtitles': video_subtitles,
'upload_date': upload_date,
}
def _get_available_subtitles(self, video_id, info_webpage):
res = {}
for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
sturl = unescapeHTML(sturl_html)
m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
if not m:
continue
res[m.group('lang')] = sturl
return res
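
_get_available_subtitles builds the language-to-URL map purely from the <track> sources: any URL whose filename is not a bare language code ending in .vtt is skipped. The same loop over made-up track URLs:

import re

track_urls = [
    'http://subs.example.net/1023585v/en.vtt',    # hypothetical
    'http://subs.example.net/1023585v/es.vtt',    # hypothetical
    'http://subs.example.net/1023585v/intro.srt', # wrong extension, skipped
]
res = {}
for sturl in track_urls:
    m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
    if not m:
        continue
    res[m.group('lang')] = sturl
print(sorted(res.keys()))  # ['en', 'es']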

View File

@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
_VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
_NETRC_MACHINE = 'vimeo'
IE_NAME = u'vimeo'
_TESTS = [
@ -196,6 +196,16 @@ class VimeoIE(InfoExtractor):
if mobj is not None:
video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
try:
view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
except RegexNotFoundError:
# This info is only available in vimeo.com/{id} urls
view_count = None
like_count = None
comment_count = None
# Vimeo specific: extract request signature and timestamp
sig = config['request']['signature']
timestamp = config['request']['timestamp']
@ -242,6 +252,9 @@ class VimeoIE(InfoExtractor):
'description': video_description,
'formats': formats,
'webpage_url': url,
'view_count': view_count,
'like_count': like_count,
'comment_count': comment_count,
}
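
One detail of the counts hunk is easy to miss: the three _search_regex calls share a single try, so if any one stat is missing (the comment notes they only appear on vimeo.com/{id} pages) all three fall back to None together. A sketch of that semantics, using ValueError as a stand-in for youtube-dl's RegexNotFoundError:

import re

def search_int(pattern, webpage):
    # Stand-in for _search_regex: raise when the pattern is absent.
    m = re.search(pattern, webpage)
    if m is None:
        raise ValueError('pattern not found')
    return int(m.group(1))

webpage = 'UserPlays:1500 UserLikes:42'  # hypothetical: no comment count
try:
    view_count = search_int(r'UserPlays:(\d+)', webpage)
    like_count = search_int(r'UserLikes:(\d+)', webpage)
    comment_count = search_int(r'UserComments:(\d+)', webpage)
except ValueError:
    # The failing lookup aborts the whole block, so every count resets.
    view_count = None
    like_count = None
    comment_count = None
print([view_count, like_count, comment_count])  # [None, None, None]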
@ -249,25 +262,77 @@ class VimeoChannelIE(InfoExtractor):
IE_NAME = u'vimeo:channel'
_VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
channel_id = mobj.group('id')
def _page_url(self, base_url, pagenum):
return '%s/videos/page:%d/' % (base_url, pagenum)
def _extract_list_title(self, webpage):
return self._html_search_regex(self._TITLE_RE, webpage, u'list title')
def _extract_videos(self, list_id, base_url):
video_ids = []
for pagenum in itertools.count(1):
webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
channel_id, u'Downloading page %s' % pagenum)
webpage = self._download_webpage(
self._page_url(base_url, pagenum), list_id,
u'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
for video_id in video_ids]
channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
webpage, u'channel title')
return {'_type': 'playlist',
'id': channel_id,
'title': channel_title,
'id': list_id,
'title': self._extract_list_title(webpage),
'entries': entries,
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
channel_id = mobj.group('id')
return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
class VimeoUserIE(VimeoChannelIE):
IE_NAME = u'vimeo:user'
_VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
@classmethod
def suitable(cls, url):
if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
return False
return super(VimeoUserIE, cls).suitable(url)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
return self._extract_videos(name, 'http://vimeo.com/%s' % name)
class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = u'vimeo:album'
_VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
def _page_url(self, base_url, pagenum):
return '%s/page:%d/' % (base_url, pagenum)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
album_id = mobj.group('id')
return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
class VimeoGroupsIE(VimeoAlbumIE):
IE_NAME = u'vimeo:group'
_VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'
def _extract_list_title(self, webpage):
return self._og_search_title(webpage)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
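
The restructuring above is a small template-method pattern: _extract_videos owns the pagination loop, while the channel/user/album/group subclasses only override _page_url and the title lookup. The loop itself, reduced to a standalone sketch with a hypothetical page fetcher:

import itertools
import re

_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'

def extract_ids(download_page):
    # download_page(pagenum) returns that page's HTML (hypothetical).
    video_ids = []
    for pagenum in itertools.count(1):
        webpage = download_page(pagenum)
        video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
        # Stop once no "next" link is present on the current page.
        if re.search(_MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
            break
    return video_ids

pages = {1: '<div id="clip_101"></div><a href="/page:2" rel="next">',
         2: '<div id="clip_102"></div>'}
print(extract_ids(pages.get))  # ['101', '102']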

View File

@ -11,7 +11,7 @@ from ..utils import (
class WatIE(InfoExtractor):
_VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
_VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
IE_NAME = 'wat.tv'
_TEST = {
u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',

View File

@ -0,0 +1,55 @@
import json
import re
from .common import InfoExtractor
class WistiaIE(InfoExtractor):
_VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
_TEST = {
u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt",
u"file": u"sh7fpupwlt.mov",
u"md5": u"cafeb56ec0c53c18c97405eecb3133df",
u"info_dict": {
u"title": u"cfh_resourceful_zdkh_final_1"
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
data_json = self._html_search_regex(
r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data')
data = json.loads(data_json)
formats = []
thumbnails = []
for atype, a in data['assets'].items():
if atype == 'still':
thumbnails.append({
'url': a['url'],
'resolution': '%dx%d' % (a['width'], a['height']),
})
continue
if atype == 'preview':
continue
formats.append({
'format_id': atype,
'url': a['url'],
'width': a['width'],
'height': a['height'],
'filesize': a['size'],
'ext': a['ext'],
})
formats.sort(key=lambda a: a['filesize'])
return {
'id': video_id,
'title': data['name'],
'formats': formats,
'thumbnails': thumbnails,
}
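
The Wistia extractor treats the asset map generically: stills become thumbnails, previews are dropped, and everything else becomes a format, sorted by filesize ascending so the largest file ends up last (youtube-dl's "best"). The shape of that loop over a made-up asset map:

assets = {  # hypothetical map shaped like Wistia's embed JSON
    'original': {'url': 'http://example.net/a.mov', 'size': 5000000},
    'iphone':   {'url': 'http://example.net/a.mp4', 'size': 900000},
    'still':    {'url': 'http://example.net/a.png', 'size': 20000},
}
formats = []
thumbnails = []
for atype, a in assets.items():
    if atype == 'still':
        thumbnails.append({'url': a['url']})
        continue
    if atype == 'preview':
        continue
    formats.append({'format_id': atype, 'url': a['url'],
                    'filesize': a['size']})
formats.sort(key=lambda a: a['filesize'])
print([f['format_id'] for f in formats])  # ['iphone', 'original']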

View File

@ -26,7 +26,7 @@ class XHamsterIE(InfoExtractor):
{
u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
u'file': u'2221348.flv',
u'md5': u'e767b9475de189320f691f49c679c4c7',
u'md5': u'970a94178ca4118c5aa3aaea21211b81',
u'info_dict': {
u"upload_date": u"20130914",
u"uploader_id": u"jojo747400",

Some files were not shown because too many files have changed in this diff.