Merge branch 'master' into use-other-downloaders

This commit is contained in:
Rogério Brito 2013-10-17 13:59:09 -03:00
commit 7212863883
47 changed files with 1446 additions and 264 deletions

View File

@ -13,13 +13,13 @@ PYTHON=/usr/bin/env python
# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
ifeq ($(PREFIX),/usr)
SYSCONFDIR=/etc
SYSCONFDIR=/etc
else
ifeq ($(PREFIX),/usr/local)
SYSCONFDIR=/etc
else
SYSCONFDIR=$(PREFIX)/etc
endif
ifeq ($(PREFIX),/usr/local)
SYSCONFDIR=/etc
else
SYSCONFDIR=$(PREFIX)/etc
endif
endif
install: youtube-dl youtube-dl.1 youtube-dl.bash-completion
@ -71,6 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
--exclude '*~' \
--exclude '__pycache' \
--exclude '.git' \
--exclude 'testdata' \
-- \
bin devscripts test youtube_dl \
CHANGELOG LICENSE README.md README.txt \

View File

@ -57,9 +57,10 @@ which means you can modify it, redistribute it or use it however you like.
file. Record all downloaded videos in it.
## Download Options:
-r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m)
-r, --rate-limit LIMIT maximum download rate in bytes per second (e.g.
50K or 4.2M)
-R, --retries RETRIES number of retries (default is 10)
--buffer-size SIZE size of download buffer (e.g. 1024 or 16k)
--buffer-size SIZE size of download buffer (e.g. 1024 or 16K)
(default is 1024)
--no-resize-buffer do not automatically adjust the buffer size. By
default, the buffer size is automatically resized
@ -100,6 +101,7 @@ which means you can modify it, redistribute it or use it however you like.
file modification time
--write-description write video description to a .description file
--write-info-json write video metadata to a .info.json file
--write-annotations write video annotations to a .annotation file
--write-thumbnail write thumbnail image to disk
## Verbosity / Simulation Options:
@ -166,6 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
processed files are overwritten by default
--embed-subs embed subtitles in the video (only for mp4
videos)
--add-metadata add metadata to the files
# CONFIGURATION

View File

@ -16,10 +16,11 @@ def main():
ie_htmls = []
for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()):
ie_html = '<b>{}</b>'.format(ie.IE_NAME)
try:
ie_desc = getattr(ie, 'IE_DESC', None)
if ie_desc is False:
continue
elif ie_desc is not None:
ie_html += ': {}'.format(ie.IE_DESC)
except AttributeError:
pass
if ie.working() == False:
ie_html += ' (Currently broken)'
ie_htmls.append('<li>{}</li>'.format(ie_html))

0
test/__init__.py Normal file
View File

View File

@ -1,22 +1,27 @@
import errno
import io
import hashlib
import json
import os.path
import re
import types
import youtube_dl.extractor
from youtube_dl import YoutubeDL, YoutubeDLHandler
from youtube_dl.utils import (
compat_cookiejar,
compat_urllib_request,
)
from youtube_dl import YoutubeDL
youtube_dl._setup_opener(timeout=10)
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
parameters = json.load(pf)
def global_setup():
youtube_dl._setup_opener(timeout=10)
def get_params(override=None):
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
parameters = json.load(pf)
if override:
parameters.update(override)
return parameters
def try_rm(filename):
@ -32,7 +37,7 @@ class FakeYDL(YoutubeDL):
def __init__(self):
# Different instances of the downloader can't share the same dictionary
# some test set the "sublang" parameter, which would break the md5 checks.
params = dict(parameters)
params = get_params()
super(FakeYDL, self).__init__(params)
self.result = []
@ -62,3 +67,6 @@ def get_testcases():
for t in getattr(ie, '_TESTS', []):
t['name'] = type(ie).__name__[:-len('IE')]
yield t
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()

View File

@ -1,14 +1,16 @@
#!/usr/bin/env python
import sys
import unittest
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import global_setup, try_rm
global_setup()
from youtube_dl import YoutubeDL
from helper import try_rm
def _download_restricted(url, filename, age):

View File

@ -1,14 +1,20 @@
#!/usr/bin/env python
import sys
import unittest
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_testcases
from youtube_dl.extractor import (
gen_extractors,
JustinTVIE,
YoutubeIE,
)
from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors
from helper import get_testcases
class TestAllURLsMatching(unittest.TestCase):
def setUp(self):

View File

@ -1,18 +1,16 @@
#!/usr/bin/env python
import sys
import unittest
import hashlib
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup, md5
global_setup()
from youtube_dl.extractor import DailymotionIE
from youtube_dl.utils import *
from helper import FakeYDL
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
class TestDailymotionSubtitles(unittest.TestCase):
def setUp(self):

View File

@ -1,26 +1,31 @@
#!/usr/bin/env python
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, get_testcases, global_setup, try_rm, md5
global_setup()
import hashlib
import io
import os
import json
import unittest
import sys
import socket
import binascii
# Allow direct execution
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import youtube_dl.YoutubeDL
from youtube_dl.utils import *
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
from youtube_dl.utils import (
compat_str,
compat_urllib_error,
DownloadError,
ExtractorError,
UnavailableVideoError,
)
RETRIES = 3
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
class YoutubeDL(youtube_dl.YoutubeDL):
def __init__(self, *args, **kwargs):
self.to_stderr = self.to_screen
@ -37,18 +42,12 @@ def _file_md5(fn):
with open(fn, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()
import helper # Set up remaining global configuration
from helper import get_testcases, try_rm
defs = get_testcases()
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
parameters = json.load(pf)
class TestDownload(unittest.TestCase):
maxDiff = None
def setUp(self):
self.parameters = parameters
self.defs = defs
### Dynamically generate tests
@ -68,8 +67,7 @@ def generator(test_case):
print_skipping(test_case['skip'])
return
params = self.parameters.copy()
params.update(test_case.get('params', {}))
params = get_params(test_case.get('params', {}))
ydl = YoutubeDL(params)
ydl.add_default_info_extractors()

View File

@ -1,13 +1,16 @@
#!/usr/bin/env python
# encoding: utf-8
import sys
import unittest
import json
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup
global_setup()
from youtube_dl.extractor import (
DailymotionPlaylistIE,
@ -16,10 +19,9 @@ from youtube_dl.extractor import (
UstreamChannelIE,
SoundcloudUserIE,
LivestreamIE,
NHLVideocenterIE,
)
from youtube_dl.utils import *
from helper import FakeYDL
class TestPlaylists(unittest.TestCase):
def assertIsPlaylist(self, info):
@ -74,5 +76,14 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], u'TEDCity2.0 (English)')
self.assertTrue(len(result['entries']) >= 4)
def test_nhl_videocenter(self):
dl = FakeYDL()
ie = NHLVideocenterIE(dl)
result = ie.extract('http://video.canucks.nhl.com/videocenter/console?catid=999')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], u'999')
self.assertEqual(result['title'], u'Highlights')
self.assertEqual(len(result['entries']), 12)
if __name__ == '__main__':
unittest.main()

View File

@ -1,14 +1,15 @@
#!/usr/bin/env python
# Various small unit tests
import sys
import unittest
import xml.etree.ElementTree
# coding: utf-8
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
from youtube_dl.utils import (
@ -20,6 +21,9 @@ from youtube_dl.utils import (
unified_strdate,
find_xpath_attr,
get_meta_content,
xpath_with_ns,
smuggle_url,
unsmuggle_url,
)
if sys.version_info < (3, 0):
@ -141,5 +145,31 @@ class TestUtil(unittest.TestCase):
self.assertEqual(get_meta('description'), u'foo & bar')
self.assertEqual(get_meta('author'), 'Plato')
def test_xpath_with_ns(self):
testxml = u'''<root xmlns:media="http://example.com/">
<media:song>
<media:author>The Author</media:author>
<url>http://server.com/download.mp3</url>
</media:song>
</root>'''
doc = xml.etree.ElementTree.fromstring(testxml)
find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
self.assertTrue(find('media:song') is not None)
self.assertEqual(find('media:song/media:author').text, u'The Author')
self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3')
def test_smuggle_url(self):
data = {u"ö": u"ö", u"abc": [3]}
url = 'https://foo.bar/baz?x=y#a'
smug_url = smuggle_url(url, data)
unsmug_url, unsmug_data = unsmuggle_url(smug_url)
self.assertEqual(url, unsmug_url)
self.assertEqual(data, unsmug_data)
res_url, res_data = unsmuggle_url(url)
self.assertEqual(res_url, url)
self.assertEqual(res_data, None)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
# coding: utf-8
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, global_setup, try_rm
global_setup()
import io
import xml.etree.ElementTree
import youtube_dl.YoutubeDL
import youtube_dl.extractor
class YoutubeDL(youtube_dl.YoutubeDL):
def __init__(self, *args, **kwargs):
super(YoutubeDL, self).__init__(*args, **kwargs)
self.to_stderr = self.to_screen
params = get_params({
'writeannotations': True,
'skip_download': True,
'writeinfojson': False,
'format': 'flv',
})
TEST_ID = 'gr51aVj-mLg'
ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml'
EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label']
class TestAnnotations(unittest.TestCase):
def setUp(self):
# Clear old files
self.tearDown()
def test_info_json(self):
expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text.
ie = youtube_dl.extractor.YoutubeIE()
ydl = YoutubeDL(params)
ydl.add_info_extractor(ie)
ydl.download([TEST_ID])
self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
annoxml = None
with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
annoxml = xml.etree.ElementTree.parse(annof)
self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
root = annoxml.getroot()
self.assertEqual(root.tag, 'document')
annotationsTag = root.find('annotations')
self.assertEqual(annotationsTag.tag, 'annotations')
annotations = annotationsTag.findall('annotation')
#Not all the annotations have TEXT children and the annotations are returned unsorted.
for a in annotations:
self.assertEqual(a.tag, 'annotation')
if a.get('type') == 'text':
textTag = a.find('TEXT')
text = textTag.text
self.assertTrue(text in expected) #assertIn only added in python 2.7
#remove the first occurance, there could be more than one annotation with the same text
expected.remove(text)
#We should have seen (and removed) all the expected annotation texts.
self.assertEqual(len(expected), 0, 'Not all expected annotations were found.')
def tearDown(self):
try_rm(ANNOTATIONS_FILE)
if __name__ == '__main__':
unittest.main()

View File

@ -1,37 +1,34 @@
#!/usr/bin/env python
# coding: utf-8
import json
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Allow direct execution
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import get_params, global_setup
global_setup()
import io
import json
import youtube_dl.YoutubeDL
import youtube_dl.extractor
from youtube_dl.utils import *
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
# General configuration (from __init__, not very elegant...)
jar = compat_cookiejar.CookieJar()
cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
proxy_handler = compat_urllib_request.ProxyHandler()
opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
compat_urllib_request.install_opener(opener)
class YoutubeDL(youtube_dl.YoutubeDL):
def __init__(self, *args, **kwargs):
super(YoutubeDL, self).__init__(*args, **kwargs)
self.to_stderr = self.to_screen
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
params = json.load(pf)
params['writeinfojson'] = True
params['skip_download'] = True
params['writedescription'] = True
params = get_params({
'writeinfojson': True,
'skip_download': True,
'writedescription': True,
})
TEST_ID = 'BaW_jenozKc'
INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
@ -42,6 +39,7 @@ This is a test video for youtube-dl.
For more information, contact phihag@phihag.de .'''
class TestInfoJSON(unittest.TestCase):
def setUp(self):
# Clear old files

View File

@ -1,20 +1,26 @@
#!/usr/bin/env python
import sys
import unittest
import json
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE
from youtube_dl.utils import *
from test.helper import FakeYDL, global_setup
global_setup()
from youtube_dl.extractor import (
YoutubeUserIE,
YoutubePlaylistIE,
YoutubeIE,
YoutubeChannelIE,
YoutubeShowIE,
)
from helper import FakeYDL
class TestYoutubeLists(unittest.TestCase):
def assertIsPlaylist(self,info):
def assertIsPlaylist(self, info):
"""Make sure the info has '_type' set to 'playlist'"""
self.assertEqual(info['_type'], 'playlist')
@ -100,7 +106,7 @@ class TestYoutubeLists(unittest.TestCase):
dl = FakeYDL()
ie = YoutubeShowIE(dl)
result = ie.extract('http://www.youtube.com/show/airdisasters')
self.assertTrue(len(result) >= 4)
self.assertTrue(len(result) >= 3)
if __name__ == '__main__':
unittest.main()

View File

@ -1,14 +1,18 @@
#!/usr/bin/env python
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import global_setup
global_setup()
import io
import re
import string
import sys
import unittest
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor import YoutubeIE
from youtube_dl.utils import compat_str, compat_urlretrieve

View File

@ -1,69 +1,79 @@
#!/usr/bin/env python
import sys
import unittest
import hashlib
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, global_setup, md5
global_setup()
from youtube_dl.extractor import YoutubeIE
from youtube_dl.utils import *
from helper import FakeYDL
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
class TestYoutubeSubtitles(unittest.TestCase):
def setUp(self):
self.DL = FakeYDL()
self.url = 'QRS8MkLhQmM'
def getInfoDict(self):
IE = YoutubeIE(self.DL)
info_dict = IE.extract(self.url)
return info_dict
def getSubtitles(self):
info_dict = self.getInfoDict()
return info_dict[0]['subtitles']
return info_dict[0]['subtitles']
def test_youtube_no_writesubtitles(self):
self.DL.params['writesubtitles'] = False
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_youtube_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
def test_youtube_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitleslangs'] = ['it']
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
def test_youtube_allsubtitles(self):
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(len(subtitles.keys()), 13)
def test_youtube_subtitles_sbv_format(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitlesformat'] = 'sbv'
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
def test_youtube_subtitles_vtt_format(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitlesformat'] = 'vtt'
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
def test_youtube_list_subtitles(self):
self.DL.expect_warning(u'Video doesn\'t have automatic captions')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_youtube_automatic_captions(self):
self.url = '8YoUxe5ncPo'
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslangs'] = ['it']
subtitles = self.getSubtitles()
self.assertTrue(subtitles['it'] is not None)
def test_youtube_nosubtitles(self):
self.DL.expect_warning(u'video doesn\'t have subtitles')
self.url = 'sAjKT8FhjI8'
@ -71,6 +81,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0)
def test_youtube_multiple_langs(self):
self.url = 'QRS8MkLhQmM'
self.DL.params['writesubtitles'] = True

View File

@ -1,5 +1,8 @@
[tox]
envlist = py26,py27,py33
[testenv]
deps = nose
commands = nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose test
deps =
nose
coverage
commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html
# test.test_download:TestDownload.test_NowVideo

View File

@ -270,6 +270,7 @@ class FileDownloader(object):
def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
test = self.params.get('test', False)
# Check for rtmpdump first
try:
@ -291,6 +292,8 @@ class FileDownloader(object):
basic_args += ['--playpath', play_path]
if tc_url is not None:
basic_args += ['--tcUrl', url]
if test:
basic_args += ['--stop', '1']
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
if self.params.get('verbose', False):
try:
@ -300,7 +303,7 @@ class FileDownloader(object):
shell_quote = repr
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
retval = subprocess.call(args)
while retval == 2 or retval == 1:
while (retval == 2 or retval == 1) and not test:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
time.sleep(5.0) # This seems to be needed
@ -313,7 +316,7 @@ class FileDownloader(object):
self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
break
if retval == 0:
if retval == 0 or (test and retval == 2):
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)

View File

@ -3,7 +3,14 @@ import subprocess
import sys
import time
from .utils import *
from .utils import (
compat_subprocess_get_DEVNULL,
encodeFilename,
PostProcessingError,
shell_quote,
subtitles_filename,
)
class PostProcessor(object):
@ -82,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor):
+ opts +
[encodeFilename(self._ffmpeg_filename_argument(out_path))])
if self._downloader.params.get('verbose', False):
self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout,stderr = p.communicate()
if p.returncode != 0:
@ -177,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
extension = self._preferredcodec
more_opts = []
if self._preferredquality is not None:
if int(self._preferredquality) < 10:
# The opus codec doesn't support the -aq option
if int(self._preferredquality) < 10 and extension != 'opus':
more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality]
else:
more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k']
@ -467,3 +477,35 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return True, information
class FFmpegMetadataPP(FFmpegPostProcessor):
def run(self, info):
metadata = {}
if info.get('title') is not None:
metadata['title'] = info['title']
if info.get('upload_date') is not None:
metadata['date'] = info['upload_date']
if info.get('uploader') is not None:
metadata['artist'] = info['uploader']
elif info.get('uploader_id') is not None:
metadata['artist'] = info['uploader_id']
if not metadata:
self._downloader.to_screen(u'[ffmpeg] There isn\'t any metadata to add')
return True, info
filename = info['filepath']
ext = os.path.splitext(filename)[1][1:]
temp_filename = filename + u'.temp'
options = ['-c', 'copy']
for (name, value) in metadata.items():
options.extend(['-metadata', '%s="%s"' % (name, value)])
options.extend(['-f', ext])
self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return True, info

View File

@ -71,6 +71,7 @@ class YoutubeDL(object):
logtostderr: Log messages to stderr instead of stdout.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
writeannotations: Write the video annotations to a .annotations.xml file
writethumbnail: Write the thumbnail image to a file
writesubtitles: Write the video subtitles to a file
writeautomaticsub: Write the automatic subtitles to a file
@ -119,7 +120,7 @@ class YoutubeDL(object):
and not params['restrictfilenames']):
# On Python 3, the Unicode filesystem API will throw errors (#1474)
self.report_warning(
u'Assuming --restrict-filenames isnce file system encoding '
u'Assuming --restrict-filenames since file system encoding '
u'cannot encode all charactes. '
u'Set the LC_ALL environment variable to fix this.')
params['restrictfilenames'] = True
@ -258,6 +259,10 @@ class YoutubeDL(object):
""" Report that the metadata file has been written """
self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
def report_writeannotations(self, annofn):
""" Report that the annotations file has been written. """
self.to_screen(u'[info] Writing video annotations to: ' + annofn)
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
try:
@ -522,6 +527,18 @@ class YoutubeDL(object):
self.report_error(u'Cannot write description file ' + descfn)
return
if self.params.get('writeannotations', False):
try:
annofn = filename + u'.annotations.xml'
self.report_writeannotations(annofn)
with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
annofile.write(info_dict['annotations'])
except (KeyError, TypeError):
self.report_warning(u'There are no annotations to write.')
except (OSError, IOError):
self.report_error(u'Cannot write annotations file: ' + annofn)
return
subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')])

View File

@ -31,6 +31,7 @@ __authors__ = (
'Huarong Huo',
'Ismael Mejía',
'Steffan \'Ruirize\' James',
'Andras Elso',
)
__license__ = 'Public Domain'
@ -46,17 +47,43 @@ import shlex
import socket
import subprocess
import sys
import warnings
import traceback
import platform
from .utils import *
from .utils import (
compat_cookiejar,
compat_print,
compat_str,
compat_urllib_request,
DateRange,
decodeOption,
determine_ext,
DownloadError,
get_cachedir,
make_HTTPS_handler,
MaxDownloadsReached,
platform_name,
preferredencoding,
SameFileError,
std_headers,
write_string,
YoutubeDLHandler,
)
from .update import update_self
from .version import __version__
from .FileDownloader import *
from .FileDownloader import (
FileDownloader,
)
from .extractor import gen_extractors
from .YoutubeDL import YoutubeDL
from .PostProcessor import *
from .PostProcessor import (
FFmpegMetadataPP,
FFmpegVideoConvertor,
FFmpegExtractAudioPP,
FFmpegEmbedSubtitlePP,
)
def parseOpts(overrideArguments=None):
def _readOptions(filename_bytes):
@ -240,11 +267,11 @@ def parseOpts(overrideArguments=None):
help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')
downloader.add_option('-r', '--rate-limit',
dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)')
dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')
downloader.add_option('-R', '--retries',
dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)
downloader.add_option('--buffer-size',
dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024")
dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")
downloader.add_option('--no-resize-buffer',
action='store_true', dest='noresizebuffer',
help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False)
@ -339,6 +366,9 @@ def parseOpts(overrideArguments=None):
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)
filesystem.add_option('--write-annotations',
action='store_true', dest='writeannotations',
help='write video annotations to a .annotation file', default=False)
filesystem.add_option('--write-thumbnail',
action='store_true', dest='writethumbnail',
help='write thumbnail image to disk', default=False)
@ -358,6 +388,8 @@ def parseOpts(overrideArguments=None):
help='do not overwrite post-processed files; the post-processed files are overwritten by default')
postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
help='embed subtitles in the video (only for mp4 videos)')
postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False,
help='add metadata to the files')
parser.add_option_group(general)
@ -603,6 +635,7 @@ def _real_main(argv=None):
'nopart': opts.nopart,
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeannotations': opts.writeannotations,
'writeinfojson': opts.writeinfojson,
'writethumbnail': opts.writethumbnail,
'writesubtitles': opts.writesubtitles,
@ -655,6 +688,9 @@ def _real_main(argv=None):
ydl.add_default_info_extractors()
# PostProcessors
# Add the metadata pp first, the other pps will copy it
if opts.addmetadata:
ydl.add_post_processor(FFmpegMetadataPP())
if opts.extractaudio:
ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
if opts.recodevideo:
@ -683,7 +719,7 @@ def _real_main(argv=None):
if opts.cookiefile is not None:
try:
jar.save()
except (IOError, OSError) as err:
except (IOError, OSError):
sys.exit(u'ERROR: unable to save cookie jar')
sys.exit(retcode)

View File

@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE
from .addanime import AddAnimeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import ArteTvIE
from .arte import (
ArteTvIE,
ArteTVPlus7IE,
ArteTVCreativeIE,
ArteTVFutureIE,
)
from .auengine import AUEngineIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
@ -12,6 +17,7 @@ from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cinemassacre import CinemassacreIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
@ -34,6 +40,7 @@ from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .exfm import ExfmIE
from .facebook import FacebookIE
from .faz import FazIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
@ -60,6 +67,7 @@ from .ign import IGNIE, OneUPIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
from .internetvideoarchive import InternetVideoArchiveIE
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
@ -80,6 +88,8 @@ from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
@ -89,8 +99,10 @@ from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE
from .rutube import RutubeIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
@ -101,7 +113,9 @@ from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
from .thisav import ThisAVIE
@ -118,10 +132,13 @@ from .veoh import VeohIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
from .wat import WatIE
from .websurg import WeBSurgIE
from .weibo import WeiboIE
from .wimp import WimpIE
from .worldstarhiphop import WorldStarHipHopIE

View File

@ -1,3 +1,4 @@
# encoding: utf-8
import re
import json
import xml.etree.ElementTree
@ -7,15 +8,15 @@ from ..utils import (
ExtractorError,
find_xpath_attr,
unified_strdate,
determine_ext,
get_element_by_id,
)
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTvIE(InfoExtractor):
"""
There are two sources of video in arte.tv: videos.arte.tv and
www.arte.tv/guide, the extraction process is different for each one.
The videos expire in 7 days, so we can't add tests.
"""
_EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
_LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor):
@classmethod
def suitable(cls, url):
return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))
return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
# TODO implement Live Stream
# from ..utils import compat_urllib_parse
@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
mobj = re.match(self._EMISSION_URL, url)
if mobj is not None:
lang = mobj.group('lang')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
video_id = mobj.group('id')
return self._extract_emission(url, video_id, lang)
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
@ -80,49 +73,6 @@ class ArteTvIE(InfoExtractor):
# self.extractLiveStream(url)
# return
def _extract_emission(self, url, video_id, lang):
"""Extract from www.arte.tv/guide"""
webpage = self._download_webpage(url, video_id)
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
self.report_extraction(video_id)
info = json.loads(json_info)
player_info = info['videoJsonPlayer']
info_dict = {'id': player_info['VID'],
'title': player_info['VTI'],
'description': player_info.get('VDE'),
'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
'thumbnail': player_info['programImage'],
'ext': 'flv',
}
formats = player_info['VSR'].values()
def _match_lang(f):
# Return true if that format is in the language of the url
if lang == 'fr':
l = 'F'
elif lang == 'de':
l = 'A'
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
formats = filter(_match_lang, formats)
# We order the formats by quality
formats = sorted(formats, key=lambda f: int(f['height']))
# Prefer videos without subtitles in the same language
formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)
# Pick the best quality
format_info = formats[-1]
if format_info['mediaType'] == u'rtmp':
info_dict['url'] = format_info['streamer']
info_dict['play_path'] = 'mp4:' + format_info['url']
else:
info_dict['url'] = format_info['url']
return info_dict
def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
@ -172,3 +122,110 @@ class ArteTvIE(InfoExtractor):
'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
}
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = u'arte.tv:+7'
_VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
@classmethod
def _extract_url_info(cls, url):
mobj = re.match(cls._VALID_URL, url)
lang = mobj.group('lang')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
video_id = mobj.group('id')
return video_id, lang
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
webpage = self._download_webpage(url, video_id)
return self._extract_from_webpage(webpage, video_id, lang)
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
self.report_extraction(video_id)
info = json.loads(json_info)
player_info = info['videoJsonPlayer']
info_dict = {
'id': player_info['VID'],
'title': player_info['VTI'],
'description': player_info.get('VDE'),
'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
formats = player_info['VSR'].values()
def _match_lang(f):
if f.get('versionCode') is None:
return True
# Return true if that format is in the language of the url
if lang == 'fr':
l = 'F'
elif lang == 'de':
l = 'A'
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
formats = filter(_match_lang, formats)
# Some formats use the m3u8 protocol
formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
# We order the formats by quality
formats = sorted(formats, key=lambda f: int(f.get('height',-1)))
# Prefer videos without subtitles in the same language
formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
# Pick the best quality
def _format(format_info):
info = {
'width': format_info.get('width'),
'height': format_info.get('height'),
}
if format_info['mediaType'] == u'rtmp':
info['url'] = format_info['streamer']
info['play_path'] = 'mp4:' + format_info['url']
info['ext'] = 'flv'
else:
info['url'] = format_info['url']
info['ext'] = determine_ext(info['url'])
return info
info_dict['formats'] = [_format(f) for f in formats]
# TODO: Remove when #980 has been merged
info_dict.update(info_dict['formats'][-1])
return info_dict
# It also uses the arte_vp_url url from the webpage to extract the information
class ArteTVCreativeIE(ArteTVPlus7IE):
IE_NAME = u'arte.tv:creative'
_VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'
_TEST = {
u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
u'file': u'050489-002.mp4',
u'info_dict': {
u'title': u'Agentur Amateur #2 - Corporate Design',
},
}
class ArteTVFutureIE(ArteTVPlus7IE):
IE_NAME = u'arte.tv:future'
_VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'
_TEST = {
u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
u'file': u'050940-003.mp4',
u'info_dict': {
u'title': u'Les champignons au secours de la planète',
},
}
def _real_extract(self, url):
anchor_id, lang = self._extract_url_info(url)
webpage = self._download_webpage(url, anchor_id)
row = get_element_by_id(anchor_id, webpage)
return self._extract_from_webpage(row, anchor_id, lang)

View File

@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor):
# Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
object_str = object_str.replace(u'<--', u'<!--')
object_doc = xml.etree.ElementTree.fromstring(object_str)
assert u'BrightcoveExperience' in object_doc.attrib['class']
@ -96,7 +98,10 @@ class BrightcoveIE(InfoExtractor):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
player_key, u'Downloading playlist information')
playlist_info = json.loads(playlist_info)['videoList']
json_data = json.loads(playlist_info)
if 'videoList' not in json_data:
raise ExtractorError(u'Empty playlist')
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
return self.playlist_result(videos, playlist_id=playlist_info['id'],

View File

@ -0,0 +1,91 @@
# encoding: utf-8
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class CinemassacreIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
_TESTS = [{
u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
u'file': u'19911.flv',
u'info_dict': {
u'upload_date': u'20121110',
u'title': u'“Angry Video Game Nerd: The Movie” Trailer',
u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
},
u'params': {
# rtmp download
u'skip_download': True,
},
},
{
u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
u'file': u'521be8ef82b16.flv',
u'info_dict': {
u'upload_date': u'20131002',
u'title': u'The Mummys Hand (1940)',
},
u'params': {
# rtmp download
u'skip_download': True,
},
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
webpage_url = u'http://' + mobj.group('url')
webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
raise ExtractorError(u'Can\'t extract embed url and video id')
playerdata_url = mobj.group(u'embed_url')
video_id = mobj.group(u'video_id')
video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
webpage, u'title')
video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, u'description', flags=re.DOTALL, fatal=False)
if len(video_description) == 0:
video_description = None
playerdata = self._download_webpage(playerdata_url, video_id)
base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'',
playerdata, u'base_url')
base_url += '/Cinemassacre/'
# Important: The file names in playerdata are not used by the player and even wrong for some videos
sd_file = 'Cinemassacre-%s_high.mp4' % video_id
hd_file = 'Cinemassacre-%s.mp4' % video_id
video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id
formats = [
{
'url': base_url + sd_file,
'ext': 'flv',
'format': 'sd',
'format_id': 'sd',
},
{
'url': base_url + hd_file,
'ext': 'flv',
'format': 'hd',
'format_id': 'hd',
},
]
info = {
'id': video_id,
'title': video_title,
'formats': formats,
'description': video_description,
'upload_date': video_date,
'thumbnail': video_thumbnail,
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -0,0 +1,60 @@
# encoding: utf-8
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
determine_ext,
clean_html,
get_element_by_attribute,
)
class FazIE(InfoExtractor):
IE_NAME = u'faz.net'
_VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html'
_TEST = {
u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
u'file': u'12610585.mp4',
u'info_dict': {
u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
u'description': u'md5:1453fbf9a0d041d985a47306192ea253',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
self.to_screen(video_id)
webpage = self._download_webpage(url, video_id)
config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
u'config xml url')
config_xml = self._download_webpage(config_xml_url, video_id,
u'Downloading config xml')
config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
encodings = config.find('ENCODINGS')
formats = []
for code in ['LOW', 'HIGH', 'HQ']:
encoding = encodings.find(code)
if encoding is None:
continue
encoding_url = encoding.find('FILENAME').text
formats.append({
'url': encoding_url,
'ext': determine_ext(encoding_url),
'format_id': code.lower(),
})
descr_html = get_element_by_attribute('class', 'Content Copy', webpage)
info = {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': clean_html(descr_html),
'thumbnail': config.find('STILL/STILL_BIG').text,
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -1,55 +1,59 @@
import re
import xml.etree.ElementTree
import json
from .common import InfoExtractor
from ..utils import (
unified_strdate,
compat_urllib_parse,
compat_urlparse,
unescapeHTML,
get_meta_content,
)
class GameSpotIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
u"file": u"6410818.mp4",
u"file": u"gs-2300-6410818.mp4",
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": {
u"title": u"Arma 3 - Community Guide: SITREP I",
u"upload_date": u"20130627",
u'description': u'Check out this video where some of the basics of Arma 3 is explained.',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_id = mobj.group('page_id')
page_id = video_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
r'http://www\.gamespot\.com/videoembed/(\d+)'],
webpage, 'video id')
data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
info_xml = self._download_webpage(info_url, video_id)
doc = xml.etree.ElementTree.fromstring(info_xml)
clip_el = doc.find('./playList/clip')
data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
data_video = json.loads(unescapeHTML(data_video_json))
http_urls = [{'url': node.find('filePath').text,
'rate': int(node.find('rate').text)}
for node in clip_el.find('./httpURI')]
best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
video_url = best_quality['url']
title = clip_el.find('./title').text
ext = video_url.rpartition('.')[2]
thumbnail_url = clip_el.find('./screenGrabURI').text
view_count = int(clip_el.find('./views').text)
upload_date = unified_strdate(clip_el.find('./postDate').text)
# Transform the manifest url to a link to the mp4 files
# they are used in mobile devices.
f4m_url = data_video['videoStreams']['f4m_stream']
f4m_path = compat_urlparse.urlparse(f4m_url).path
QUALITIES_RE = r'((,\d+)+,?)'
qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',')
http_path = f4m_path[1:].split('/', 1)[1]
http_template = re.sub(QUALITIES_RE, r'%s', http_path)
http_template = http_template.replace('.csmil/manifest.f4m', '')
http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
formats = []
for q in qualities:
formats.append({
'url': http_template % q,
'ext': 'mp4',
'format_id': q,
})
return [{
'id' : video_id,
'url' : video_url,
'ext' : ext,
'title' : title,
'thumbnail' : thumbnail_url,
'upload_date' : upload_date,
'view_count' : view_count,
}]
info = {
'id': data_video['guid'],
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
'description': get_meta_content('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -11,6 +11,8 @@ from ..utils import (
compat_urlparse,
ExtractorError,
smuggle_url,
unescapeHTML,
)
from .brightcove import BrightcoveIE
@ -29,6 +31,17 @@ class GenericIE(InfoExtractor):
u"title": u"R\u00e9gis plante sa Jeep"
}
},
# embedded vimeo video
{
u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
u'file': u'22444065.mp4',
u'md5': u'2903896e23df39722c33f015af0666e2',
u'info_dict': {
u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
u"uploader_id": u"skillsmatter",
u"uploader": u"Skills Matter",
}
}
]
def report_download_webpage(self, video_id):
@ -121,12 +134,20 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
# Look for BrightCove:
m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
if m_brightcove is not None:
self.to_screen(u'Brightcove video detected.')
bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
return self.url_result(bc_url, 'Brightcove')
# Look for embedded Vimeo player
mobj = re.search(
r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage)
if mobj:
player_url = unescapeHTML(mobj.group(1))
surl = smuggle_url(player_url, {'Referer': url})
return self.url_result(surl, 'Vimeo')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@ -0,0 +1,87 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
compat_urllib_parse,
xpath_with_ns,
determine_ext,
)
class InternetVideoArchiveIE(InfoExtractor):
_VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
_TEST = {
u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
u'file': u'452693.mp4',
u'info_dict': {
u'title': u'SKYFALL',
u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
u'duration': 156,
},
}
@staticmethod
def _build_url(query):
return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
@staticmethod
def _clean_query(query):
NEEDED_ARGS = ['publishedid', 'customerid']
query_dic = compat_urlparse.parse_qs(query)
cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
# Other player ids return m3u8 urls
cleaned_dic['playerid'] = '247'
cleaned_dic['videokbrate'] = '100000'
return compat_urllib_parse.urlencode(cleaned_dic)
def _real_extract(self, url):
query = compat_urlparse.urlparse(url).query
query_dic = compat_urlparse.parse_qs(query)
video_id = query_dic['publishedid'][0]
url = self._build_url(query)
flashconfiguration_xml = self._download_webpage(url, video_id,
u'Downloading flash configuration')
flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
# and http links (no m3u8 manifests)
file_url = re.sub(r'(?<=\?)(.+)$',
lambda m: self._clean_query(m.group()),
file_url)
info_xml = self._download_webpage(file_url, video_id,
u'Downloading video info')
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
item = info.find('channel/item')
def _bp(p):
return xpath_with_ns(p,
{'media': 'http://search.yahoo.com/mrss/',
'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
formats = []
for content in item.findall(_bp('media:group/media:content')):
attr = content.attrib
f_url = attr['url']
formats.append({
'url': f_url,
'ext': determine_ext(f_url),
'width': int(attr['width']),
'bitrate': int(attr['bitrate']),
})
formats = sorted(formats, key=lambda f: f['bitrate'])
info = {
'id': video_id,
'title': item.find('title').text,
'formats': formats,
'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
'description': item.find('description').text,
'duration': int(attr['duration']),
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -87,7 +87,7 @@ class MTVIE(InfoExtractor):
description_node = itemdoc.find('description')
if description_node is not None:
description = description_node.text
description = description_node.text.strip()
else:
description = None

120
youtube_dl/extractor/nhl.py Normal file
View File

@ -0,0 +1,120 @@
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
compat_urllib_parse,
determine_ext,
unified_strdate,
)
class NHLBaseInfoExtractor(InfoExtractor):
@staticmethod
def _fix_json(json_string):
return json_string.replace('\\\'', '\'')
def _extract_video(self, info):
video_id = info['id']
self.report_extraction(video_id)
initial_video_url = info['publishPoint']
data = compat_urllib_parse.urlencode({
'type': 'fvod',
'path': initial_video_url.replace('.mp4', '_sd.mp4'),
})
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
path_response = self._download_webpage(path_url, video_id,
u'Downloading final video url')
path_doc = xml.etree.ElementTree.fromstring(path_response)
video_url = path_doc.find('path').text
join = compat_urlparse.urljoin
return {
'id': video_id,
'title': info['name'],
'url': video_url,
'ext': determine_ext(video_url),
'description': info['description'],
'duration': int(info['duration']),
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
'upload_date': unified_strdate(info['releaseDate'].split('.')[0]),
}
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = u'nhl.com'
_VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)'
_TEST = {
u'url': u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
u'file': u'453614.mp4',
u'info_dict': {
u'title': u'Quick clip: Weise 4-3 goal vs Flames',
u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.',
u'duration': 18,
u'upload_date': u'20131006',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
info_json = self._download_webpage(json_url, video_id,
u'Downloading info json')
info_json = self._fix_json(info_json)
info = json.loads(info_json)[0]
return self._extract_video(info)
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = u'nhl.com:videocenter'
IE_DESC = u'Download the first 12 videos from a videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
@classmethod
def suitable(cls, url):
if NHLIE.suitable(url):
return False
return super(NHLVideocenterIE, cls).suitable(url)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
team = mobj.group('team')
webpage = self._download_webpage(url, team)
cat_id = self._search_regex(
[r'var defaultCatId = "(.+?)";',
r'{statusIndex:0,index:0,.*?id:(.*?),'],
webpage, u'category id')
playlist_title = self._html_search_regex(
r'\?catid=%s">(.*?)</a>' % cat_id,
webpage, u'playlist title', flags=re.DOTALL)
data = compat_urllib_parse.urlencode({
'cid': cat_id,
# This is the default value
'count': 12,
'ptrs': 3,
'format': 'json',
})
path = '/videocenter/servlets/browse?' + data
request_url = compat_urlparse.urljoin(url, path)
response = self._download_webpage(request_url, playlist_title)
response = self._fix_json(response)
if not response.strip():
self._downloader.report_warning(u'Got an empty reponse, trying '
u'adding the "newvideos" parameter')
response = self._download_webpage(request_url + '&newvideos=true',
playlist_title)
response = self._fix_json(response)
videos = json.loads(response)
return {
'_type': 'playlist',
'title': playlist_title,
'id': cat_id,
'entries': [self._extract_video(i) for i in videos],
}

View File

@ -0,0 +1,43 @@
import re
from .common import InfoExtractor
from ..utils import compat_urlparse
class NowVideoIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)'
_TEST = {
u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
u'file': u'0mw0yow7b6dxa.flv',
u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
u'info_dict': {
u"title": u"youtubedl test video _BaW_jenozKc.mp4"
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.nowvideo.ch/video/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<h4>(.*)</h4>',
webpage, u'video title')
video_key = self._search_regex(r'var fkzd="(.*)";',
webpage, u'video key')
api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
api_response = self._download_webpage(api_call, video_id,
u'Downloading API page')
video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
return [{
'id': video_id,
'url': video_url,
'ext': 'flv',
'title': video_title,
}]

View File

@ -0,0 +1,16 @@
from .videodetective import VideoDetectiveIE
# It just uses the same method as videodetective.com,
# the internetvideoarchive.com is extracted from the og:video property
class RottenTomatoesIE(VideoDetectiveIE):
_VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
u'file': '613340.mp4',
u'info_dict': {
u'title': u'TOY STORY 3',
u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
},
}

View File

@ -0,0 +1,58 @@
# encoding: utf-8
import re
import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
compat_str,
ExtractorError,
)
class RutubeIE(InfoExtractor):
_VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
_TEST = {
u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4',
u'info_dict': {
u'title': u'Раненный кенгуру забежал в аптеку',
u'uploader': u'NTDRussian',
u'uploader_id': u'29790',
},
u'params': {
# It requires ffmpeg (m3u8 download)
u'skip_download': True,
},
}
def _get_api_response(self, short_id, subpath):
api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id)
response_json = self._download_webpage(api_url, short_id,
u'Downloading %s json' % subpath)
return json.loads(response_json)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
long_id = mobj.group('long_id')
webpage = self._download_webpage(url, long_id)
og_video = self._og_search_video_url(webpage)
short_id = compat_urlparse.urlparse(og_video).path[1:]
options = self._get_api_response(short_id, 'options')
trackinfo = self._get_api_response(short_id, 'trackinfo')
# Some videos don't have the author field
author = trackinfo.get('author') or {}
m3u8_url = trackinfo['video_balancer'].get('m3u8')
if m3u8_url is None:
raise ExtractorError(u'Couldn\'t find m3u8 manifest url')
return {
'id': trackinfo['id'],
'title': trackinfo['title'],
'url': m3u8_url,
'ext': 'mp4',
'thumbnail': options['thumbnail_url'],
'uploader': author.get('name'),
'uploader_id': compat_str(author['id']) if author else None,
}

View File

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
import re
from .common import InfoExtractor
from ..utils import determine_ext
class SztvHuIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
_TEST = {
u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
u'file': u'20130909.mp4',
u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
u'info_dict': {
u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_file = self._search_regex(
r'file: "...:(.*?)",', webpage, 'video file')
title = self._html_search_regex(
r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"',
webpage, 'video title')
description = self._html_search_regex(
r'<meta name="description" content="([^"]*)"/>',
webpage, 'video description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
video_url = 'http://media.sztv.hu/vod/' + video_file
return {
'id': video_id,
'url': video_url,
'title': title,
'ext': determine_ext(video_url),
'description': description,
'thumbnail': thumbnail,
}

View File

@ -0,0 +1,65 @@
import re
from .common import InfoExtractor
from ..utils import (
get_element_by_attribute,
clean_html,
)
class TechTalksIE(InfoExtractor):
_VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
_TEST = {
u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
u'playlist': [
{
u'file': u'57758.flv',
u'info_dict': {
u'title': u'Learning Topic Models --- Going beyond SVD',
},
},
{
u'file': u'57758-slides.flv',
u'info_dict': {
u'title': u'Learning Topic Models --- Going beyond SVD',
},
},
],
u'params': {
# rtmp download
u'skip_download': True,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
talk_id = mobj.group('id')
webpage = self._download_webpage(url, talk_id)
rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage,
u'rtmp url')
play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
webpage, u'presenter play path')
title = clean_html(get_element_by_attribute('class', 'title', webpage))
video_info = {
'id': talk_id,
'title': title,
'url': rtmp_url,
'play_path': play_path,
'ext': 'flv',
}
m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
if m_slides is None:
return video_info
else:
return [
video_info,
# The slides video
{
'id': talk_id + '-slides',
'title': title,
'url': rtmp_url,
'play_path': m_slides.group(1),
'ext': 'flv',
},
]

View File

@ -7,15 +7,25 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
_TEST = {
_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
_TESTS = [{
u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
u'file': u'159448201.f4v',
u'md5': u'140a49ed444bd22f93330985d8475fcb',
u'info_dict': {
u"title": u"卡马乔国足开大脚长传冲吊集锦"
}
}
},
{
u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
u'file': u'todo.mp4',
u'md5': u'todo.mp4',
u'info_dict': {
u'title': u'todo.mp4',
},
u'add_ie': [u'Youku'],
u'skip': u'Only works from China'
}]
def _url_for_id(self, id, quality = None):
info_url = "http://v2.tudou.com/f?id="+str(id)
@ -29,14 +39,18 @@ class TudouIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(2)
webpage = self._download_webpage(url, video_id)
title = re.search(",kw:\"(.+)\"",webpage)
if title is None:
title = re.search(",kw: \'(.+)\'",webpage)
title = title.group(1)
thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
if thumbnail_url is None:
thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
thumbnail_url = thumbnail_url.group(1)
m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
if m and m.group(1):
return {
'_type': 'url',
'url': u'youku:' + m.group(1),
'ie_key': 'Youku'
}
title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title')
thumbnail_url = self._search_regex(
r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False)
segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
segments = json.loads(segs_json)

View File

@ -1,11 +1,15 @@
import re
import json
import xml.etree.ElementTree
import datetime
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
)
class VevoIE(InfoExtractor):
"""
Accepts urls from vevo.com or in the format 'vevo:{id}'
@ -15,11 +19,11 @@ class VevoIE(InfoExtractor):
_TEST = {
u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
u'file': u'GB1101300280.mp4',
u'md5': u'06bea460acb744eab74a9d7dcb4bfd61',
u'info_dict': {
u"upload_date": u"20130624",
u"uploader": u"Hurts",
u"title": u"Somebody to Die For"
u"title": u"Somebody to Die For",
u'duration': 230,
}
}
@ -27,27 +31,47 @@ class VevoIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
json_url = 'http://www.vevo.com/data/video/%s' % video_id
base_url = 'http://smil.lvl3.vevo.com'
videos_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (base_url, video_id, video_id.lower())
json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
links_webpage = self._download_webpage(videos_url, video_id, u'Downloading videos urls')
self.report_extraction(video_id)
video_info = json.loads(info_json)
m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage))
if m_urls is None or len(m_urls) == 0:
raise ExtractorError(u'Unable to extract video url')
# They are sorted from worst to best quality
m_url = m_urls[-1]
video_url = base_url + '/' + m_url.group('url')
ext = m_url.group('ext')
video_info = json.loads(info_json)['video']
last_version = {'version': -1}
for version in video_info['videoVersions']:
# These are the HTTP downloads, other types are for different manifests
if version['sourceType'] == 2:
if version['version'] > last_version['version']:
last_version = version
if last_version['version'] == -1:
raise ExtractorError(u'Unable to extract last version of the video')
return {'url': video_url,
'ext': ext,
'id': video_id,
'title': video_info['title'],
'thumbnail': video_info['img'],
'upload_date': video_info['launchDate'].replace('/',''),
'uploader': video_info['Artists'][0]['title'],
}
renditions = xml.etree.ElementTree.fromstring(last_version['data'])
formats = []
# Already sorted from worst to best quality
for rend in renditions.findall('rendition'):
attr = rend.attrib
f_url = attr['url']
formats.append({
'url': f_url,
'ext': determine_ext(f_url),
'height': int(attr['frameheight']),
'width': int(attr['frameWidth']),
})
date_epoch = int(self._search_regex(
r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000
upload_date = datetime.datetime.fromtimestamp(date_epoch)
info = {
'id': video_id,
'title': video_info['title'],
'formats': formats,
'thumbnail': video_info['imageUrl'],
'upload_date': upload_date.strftime('%Y%m%d'),
'uploader': video_info['mainArtists'][0]['artistName'],
'duration': video_info['duration'],
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
return info

View File

@ -0,0 +1,30 @@
import re
from .common import InfoExtractor
from .internetvideoarchive import InternetVideoArchiveIE
from ..utils import (
compat_urlparse,
)
class VideoDetectiveIE(InfoExtractor):
_VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
u'file': u'194487.mp4',
u'info_dict': {
u'title': u'KICK-ASS 2',
u'description': u'md5:65ba37ad619165afac7d432eaded6013',
u'duration': 138,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
return self.url_result(InternetVideoArchiveIE._build_url(query),
ie=InternetVideoArchiveIE.ie_key())

View File

@ -0,0 +1,40 @@
import re
import random
from .common import InfoExtractor
class VideoPremiumIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
_TEST = {
u'url': u'http://videopremium.tv/4w7oadjsf156',
u'file': u'4w7oadjsf156.f4v',
u'info_dict': {
u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4"
},
u'params': {
u'skip_download': True,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
webpage, u'video title')
return [{
'id': video_id,
'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
'play_path': "mp4:%s.f4v" % video_id,
'page_url': "http://videopremium.tv/" + video_id,
'player_url': "http://videopremium.tv/uplayer/uppod.swf",
'ext': 'f4v',
'title': video_title,
}]

View File

@ -11,6 +11,7 @@ from ..utils import (
get_element_by_attribute,
ExtractorError,
std_headers,
unsmuggle_url,
)
class VimeoIE(InfoExtractor):
@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor):
u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
u'uploader': u'The BLN & Business of Software',
},
},
}
]
def _login(self):
@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor):
self._login()
def _real_extract(self, url, new_video=True):
url, data = unsmuggle_url(url)
headers = std_headers
if data is not None:
headers = headers.copy()
headers.update(data)
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, std_headers)
request = compat_urllib_request.Request(url, None, headers)
webpage = self._download_webpage(request, video_id)
# Now we begin extracting as much information as we can from what we

View File

@ -0,0 +1,59 @@
# coding: utf-8
import re
from ..utils import (
compat_urllib_request,
compat_urllib_parse
)
from .common import InfoExtractor
class WeBSurgIE(InfoExtractor):
IE_NAME = u'websurg.com'
_VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)'
_TEST = {
u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012',
u'file': u'vd01en4012.mp4',
u'params': {
u'skip_download': True,
},
u'skip': u'Requires login information',
}
_LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1'
def _real_initialize(self):
login_form = {
'username': self._downloader.params['username'],
'password': self._downloader.params['password'],
'Submit': 1
}
request = compat_urllib_request.Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
request.add_header(
'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8')
compat_urllib_request.urlopen(request).info()
webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in')
if webpage != 'OK':
self._downloader.report_error(
u'Unable to log in: bad username/password')
def _real_extract(self, url):
video_id = re.match(self._VALID_URL, url).group(1)
webpage = self._download_webpage(url, video_id)
url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage)
return {'id': video_id,
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'ext' : 'mp4',
'url' : url_info.group(1) + '/' + url_info.group(2),
'thumbnail': self._og_search_thumbnail(webpage)
}

View File

@ -50,6 +50,21 @@ class YahooIE(InfoExtractor):
webpage, u'items', flags=re.MULTILINE)
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
# from another page
long_id = info['id']
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id)
data = compat_urllib_parse.urlencode({
'q': query,
'env': 'prod',
'format': 'json',
})
query_result_json = self._download_webpage(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, u'Downloading video info')
query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
formats = []

View File

@ -13,7 +13,7 @@ from ..utils import (
class YoukuIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
_VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'
_TEST = {
u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
u"file": u"XNDgyMDQ2NTQw_part00.flv",

View File

@ -1116,6 +1116,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'lang': lang,
'v': video_id,
'fmt': self._downloader.params.get('subtitlesformat'),
'name': l[0],
})
url = u'http://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
@ -1149,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
list_page = self._download_webpage(list_url, video_id)
caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
original_lang_node = caption_list.find('track')
if original_lang_node.attrib.get('kind') != 'asr' :
if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
self._downloader.report_warning(u'Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
@ -1249,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url_map[itag] = format_url
return url_map
def _extract_annotations(self, video_id):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
def _real_extract(self, url):
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
@ -1381,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
# Decide which formats to download
try:
@ -1494,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'subtitles': video_subtitles,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations
})
return results
@ -1634,7 +1645,7 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!watch(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'

View File

@ -9,6 +9,7 @@ import io
import json
import locale
import os
import pipes
import platform
import re
import socket
@ -229,6 +230,19 @@ else:
return f
return None
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
components = [c.split(':') for c in path.split('/')]
replaced = []
for c in components:
if len(c) == 1:
replaced.append(c[0])
else:
ns, tag = c
replaced.append('{%s}%s' % (ns_map[ns], tag))
return '/'.join(replaced)
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
@ -715,6 +729,7 @@ def unified_strdate(date_str):
'%Y/%m/%d %H:%M:%S',
'%d.%m.%Y %H:%M',
'%Y-%m-%dT%H:%M:%SZ',
'%Y-%m-%dT%H:%M:%S',
]
for expression in format_expressions:
try:
@ -926,3 +941,24 @@ class locked_file(object):
def read(self, *args):
return self.f.read(*args)
def shell_quote(args):
return ' '.join(map(pipes.quote, args))
def smuggle_url(url, data):
""" Pass additional data in a URL for internal use. """
sdata = compat_urllib_parse.urlencode(
{u'__youtubedl_smuggle': json.dumps(data)})
return url + u'#' + sdata
def unsmuggle_url(smug_url):
if not '#__youtubedl_smuggle' in smug_url:
return smug_url, None
url, _, sdata = smug_url.rpartition(u'#')
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond)
return url, data

View File

@ -1,2 +1,2 @@
__version__ = '2013.10.07'
__version__ = '2013.10.17'