diff --git a/README.md b/README.md index 580b16004..029c418d1 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ which means you can modify it, redistribute it or use it however you like. --list-extractors List all supported extractors and the URLs they would handle --extractor-descriptions Output descriptions of all supported extractors - --proxy URL Use the specified HTTP/HTTPS proxy + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an + empty string (--proxy "") for direct connection --no-check-certificate Suppress HTTPS certificate validation. --cache-dir DIR Location in the filesystem where youtube-dl can store downloaded information permanently. By @@ -55,8 +56,9 @@ which means you can modify it, redistribute it or use it however you like. --dateafter DATE download only videos uploaded after this date --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age - --download-archive FILE Download only videos not present in the archive - file. Record all downloaded videos in it. + --download-archive FILE Download only videos not listed in the archive + file. Record the IDs of all downloaded videos in + it. ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. @@ -130,11 +132,11 @@ which means you can modify it, redistribute it or use it however you like. -v, --verbose print various debugging information --dump-intermediate-pages print downloaded pages to debug problems(very verbose) - --write-pages Write downloaded pages to files in the current - directory + --write-pages Write downloaded intermediary pages to files in + the current directory to debug problems ## Video Format Options: - -f, --format FORMAT video format code, specifiy the order of + -f, --format FORMAT video format code, specify the order of preference using slashes: "-f 22/17/18". 
"-f mp4" and "-f flv" are also supported --all-formats download all available video formats @@ -182,7 +184,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`. # OUTPUT TEMPLATE diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index ce893fcbe..3af87a378 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,10 +1,21 @@ __youtube_dl() { - local cur prev opts + local cur prev opts fileopts diropts keywords COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" - keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" + keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" + fileopts="-a|--batch-file|--download-archive|--cookies" + diropts="--cache-dir" + + if [[ ${prev} =~ ${fileopts} ]]; then + COMPREPLY=( $(compgen -f -- ${cur}) ) + return 0 + elif [[ ${prev} =~ ${diropts} ]]; then + COMPREPLY=( $(compgen -d -- ${cur}) ) + return 0 + fi if [[ ${cur} =~ : ]]; then COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) diff --git a/test/helper.py b/test/helper.py index d7bf7a828..b1f421ac5 100644 --- a/test/helper.py +++ b/test/helper.py @@ -12,10 +12,6 @@ from youtube_dl import YoutubeDL from youtube_dl.utils import preferredencoding -def global_setup(): - youtube_dl._setup_opener(timeout=10) - - def get_params(override=None): PARAMETERS_FILE = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") diff --git a/test/parameters.json b/test/parameters.json index f042880ed..487a46d56 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -39,5 +39,6 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false + "listssubtitles": false, + "socket_timeout": 20 } diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 506572e9e..c9cdb96cb 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import global_setup, try_rm -global_setup() +from test.helper import try_rm from youtube_dl import YoutubeDL diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56e5f80e1..6b9764c67 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -100,10 +100,15 @@ class TestAllURLsMatching(unittest.TestCase): def test_keywords(self): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) - self.assertMatch(':thedailyshow', ['ComedyCentral']) - self.assertMatch(':tds', ['ComedyCentral']) - self.assertMatch(':colbertreport', ['ComedyCentral']) - self.assertMatch(':cr', ['ComedyCentral']) + self.assertMatch(':ythistory', ['youtube:history']) + self.assertMatch(':thedailyshow', ['ComedyCentralShows']) + self.assertMatch(':tds', ['ComedyCentralShows']) + self.assertMatch(':colbertreport', ['ComedyCentralShows']) + self.assertMatch(':cr', ['ComedyCentralShows']) + + def test_vimeo_matching(self): + self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) if __name__ == '__main__': diff --git a/test/test_download.py b/test/test_download.py index fe7f7b8cb..dd5818dba 100644 --- a/test/test_download.py +++ 
b/test/test_download.py @@ -9,12 +9,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( get_params, get_testcases, - global_setup, try_rm, md5, report_warning ) -global_setup() import hashlib diff --git a/test/test_playlists.py b/test/test_playlists.py index 7c67239a4..87ca401e5 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,21 +8,25 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( DailymotionPlaylistIE, DailymotionUserIE, VimeoChannelIE, + VimeoUserIE, + VimeoAlbumIE, + VimeoGroupsIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, LivestreamIE, NHLVideocenterIE, BambuserChannelIE, - BandcampAlbumIE + BandcampAlbumIE, + SmotriCommunityIE, + SmotriUserIE ) @@ -55,6 +59,30 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Vimeo Tributes') self.assertTrue(len(result['entries']) > 24) + def test_vimeo_user(self): + dl = FakeYDL() + ie = VimeoUserIE(dl) + result = ie.extract('http://vimeo.com/nkistudio/videos') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Nki') + self.assertTrue(len(result['entries']) > 65) + + def test_vimeo_album(self): + dl = FakeYDL() + ie = VimeoAlbumIE(dl) + result = ie.extract('http://vimeo.com/album/2632481') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Staff Favorites: November 2013') + self.assertTrue(len(result['entries']) > 12) + + def test_vimeo_groups(self): + dl = FakeYDL() + ie = VimeoGroupsIE(dl) + result = ie.extract('http://vimeo.com/groups/rolexawards') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Rolex Awards for Enterprise') + self.assertTrue(len(result['entries']) > 72) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) @@ 
-111,6 +139,24 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['title'], u'Nightmare Night EP') self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_community(self): + dl = FakeYDL() + ie = SmotriCommunityIE(dl) + result = ie.extract('http://smotri.com/community/video/kommuna') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'kommuna') + self.assertEqual(result['title'], u'КПРФ') + self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_user(self): + dl = FakeYDL() + ie = SmotriUserIE(dl) + result = ie.extract('http://smotri.com/user/inspector') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'inspector') + self.assertEqual(result['title'], u'Inspector') + self.assertTrue(len(result['entries']) >= 9) if __name__ == '__main__': unittest.main() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 06a304879..23a653124 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup, md5 -global_setup() +from test.helper import FakeYDL, md5 from youtube_dl.extractor import ( @@ -73,7 +72,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') def test_youtube_list_subtitles(self): self.DL.expect_warning(u'Video doesn\'t have automatic captions') diff --git a/test/test_utils.py b/test/test_utils.py index e9e590e74..0fa66beec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -26,6 +26,7 @@ from youtube_dl.utils import ( unsmuggle_url, shell_quote, encodeFilename, + str_to_int, ) if sys.version_info < (3, 0): @@ -176,6 
+177,10 @@ class TestUtil(unittest.TestCase): args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')] self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""") + def test_str_to_int(self): + self.assertEqual(str_to_int('123,456'), 123456) + self.assertEqual(str_to_int('123.456'), 123456) + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 35defb895..eac53b285 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup, try_rm -global_setup() +from test.helper import get_params, try_rm import io diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index 30c4859fd..90426a559 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup -global_setup() +from test.helper import get_params import io @@ -34,6 +33,7 @@ TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.info.json' DESCRIPTION_FILE = TEST_ID + '.mp4.description' EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐 +test URL: https://github.com/rg3/youtube-dl/issues/1892 This is a test video for youtube-dl. 
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 938517a2d..95f07d129 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( @@ -108,5 +107,14 @@ class TestYoutubeLists(unittest.TestCase): result = ie.extract('http://www.youtube.com/show/airdisasters') self.assertTrue(len(result) >= 3) + def test_youtube_mix(self): + dl = FakeYDL() + ie = YoutubePlaylistIE(dl) + result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y') + entries = result['entries'] + self.assertTrue(len(entries) >= 20) + original_video = entries[0] + self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5e1ff5eb0..056700614 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -6,9 +6,6 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import global_setup -global_setup() - import io import re diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index e5a542ed5..3ff9716b3 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -1,4 +1,3 @@ -import math import os import re import subprocess @@ -11,6 +10,7 @@ from .utils import ( ContentTooShortError, determine_ext, encodeFilename, + format_bytes, sanitize_open, timeconvert, ) @@ -53,20 +53,6 @@ class FileDownloader(object): self._progress_hooks = [] self.params = params - @staticmethod - def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = 
int(math.log(bytes, 1024.0)) - suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - @staticmethod def format_seconds(seconds): (mins, secs) = divmod(seconds, 60) @@ -117,7 +103,7 @@ class FileDownloader(object): def format_speed(speed): if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) + return '%10s' % ('%s/s' % format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -270,6 +256,61 @@ class FileDownloader(object): (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): + def run_rtmpdump(args): + start = time.time() + resume_percent = None + resume_downloaded_data_len = None + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + cursor_in_new_line = True + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = u'' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1))*1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len + eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent) + speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) + data_len_str = u'~' + format_bytes(data_len) + self.report_progress(percent, data_len_str, speed, eta) + cursor_in_new_line = False + 
self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'total_bytes': data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'eta': eta, + 'speed': speed, + }) + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen(u'') + cursor_in_new_line = True + self.to_screen(u'[rtmpdump] '+line) + proc.wait() + if not cursor_in_new_line: + self.to_screen(u'') + return proc.returncode + self.report_destination(filename) tmpfilename = self.temp_name(filename) test = self.params.get('test', False) @@ -280,12 +321,11 @@ class FileDownloader(object): except (OSError, IOError): self.report_error(u'RTMP download detected but "rtmpdump" could not be run') return False - verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet' # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. 
- basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename] + basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: @@ -299,30 +339,48 @@ class FileDownloader(object): if live: basic_args += ['--live'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] + + if sys.platform == 'win32' and sys.version_info < (3, 0): + # Windows subprocess module does not actually support Unicode + # on Python 2.x + # See http://stackoverflow.com/a/9951851/35070 + subprocess_encoding = sys.getfilesystemencoding() + args = [a.encode(subprocess_encoding, 'ignore') for a in args] + else: + subprocess_encoding = None + if self.params.get('verbose', False): + if subprocess_encoding: + str_args = [ + a.decode(subprocess_encoding) if isinstance(a, bytes) else a + for a in args] + else: + str_args = args try: import pipes - shell_quote = lambda args: ' '.join(map(pipes.quote, args)) + shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) except ImportError: shell_quote = repr - self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) - retval = subprocess.call(args) + self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) + + retval = run_rtmpdump(args) + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) + self.to_screen(u'[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) + retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) cursize = os.path.getsize(encodeFilename(tmpfilename)) if prevsize == cursize and retval == 1: break # Some rtmp streams seem abort after ~ 99.8%. 
Don't complain for those if prevsize == cursize and retval == 2 and cursize > 1024: - self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.') retval = 0 break if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) + self.to_screen(u'[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, @@ -525,7 +583,7 @@ class FileDownloader(object): self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) return False - data_len_str = self.format_bytes(data_len) + data_len_str = format_bytes(data_len) byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) start = time.time() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 579cca122..285ff5fe7 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -7,8 +7,10 @@ import errno import io import json import os +import platform import re import shutil +import subprocess import socket import sys import time @@ -18,6 +20,7 @@ if os.name == 'nt': import ctypes from .utils import ( + compat_cookiejar, compat_http_client, compat_print, compat_str, @@ -30,9 +33,12 @@ from .utils import ( DownloadError, encodeFilename, ExtractorError, + format_bytes, locked_file, + make_HTTPS_handler, MaxDownloadsReached, PostProcessingError, + platform_name, preferredencoding, SameFileError, sanitize_filename, @@ -41,9 +47,11 @@ from .utils import ( UnavailableVideoError, write_json_file, write_string, + YoutubeDLHandler, ) from .extractor import get_info_extractor, gen_extractors from .FileDownloader import FileDownloader +from .version import __version__ class YoutubeDL(object): @@ -118,9 +126,13 @@ class 
YoutubeDL(object): noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. - downloadarchive: File name of a file where all downloads are recorded. + download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. + cookiefile: File name where cookies should be read from and dumped to. + nocheckcertificate:Do not verify SSL certificates + proxy: URL of the proxy server to use + socket_timeout: Time to wait for unresponsive hosts, in seconds The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -135,7 +147,7 @@ class YoutubeDL(object): _num_downloads = None _screen_file = None - def __init__(self, params): + def __init__(self, params=None): """Create a FileDownloader object with the given options.""" self._ies = [] self._ies_instances = {} @@ -144,6 +156,7 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self.params = {} if params is None else params if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -153,14 +166,15 @@ class YoutubeDL(object): u'Assuming --restrict-filenames since file system encoding ' u'cannot encode all charactes. ' u'Set the LC_ALL environment variable to fix this.') - params['restrictfilenames'] = True + self.params['restrictfilenames'] = True - self.params = params self.fd = FileDownloader(self, self.params) - if '%(stitle)s' in self.params['outtmpl']: + if '%(stitle)s' in self.params.get('outtmpl', ''): self.report_warning(u'%(stitle)s is deprecated. 
Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') + self._setup_opener() + def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) @@ -241,10 +255,9 @@ class YoutubeDL(object): def __exit__(self, *args): self.restore_console_title() - - def fixed_template(self): - """Checks if the output template is fixed.""" - return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) + + if self.params.get('cookiefile') is not None: + self.cookiejar.save() def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -392,7 +405,8 @@ class YoutubeDL(object): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}): + def extract_info(self, url, download=True, ie_key=None, extra_info={}, + process=True): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. 
@@ -428,7 +442,10 @@ class YoutubeDL(object): 'webpage_url': url, 'extractor_key': ie.ie_key(), }) - return self.process_ie_result(ie_result, download, extra_info) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break @@ -461,8 +478,33 @@ class YoutubeDL(object): download, ie_key=ie_result.get('ie_key'), extra_info=extra_info) - elif result_type == 'playlist': + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + def make_result(embedded_info): + new_result = ie_result.copy() + for f in ('_type', 'url', 'ext', 'player_url', 'formats', + 'entries', 'urlhandle', 'ie_key', 'duration', + 'subtitles', 'annotations', 'format', + 'thumbnail', 'thumbnails'): + if f in new_result: + del new_result[f] + if f in embedded_info: + new_result[f] = embedded_info[f] + return new_result + new_result = make_result(info) + + assert new_result.get('_type') != 'url_transparent' + if new_result.get('_type') == 'compat_list': + new_result['entries'] = [ + make_result(e) for e in new_result['entries']] + + return self.process_ie_result( + new_result, download=download, extra_info=extra_info) + elif result_type == 'playlist': # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -783,13 +825,15 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" - if len(url_list) > 1 and self.fixed_template(): + if (len(url_list) > 1 and + '%' not in self.params['outtmpl'] + and self.params.get('max_downloads') != 1): raise SameFileError(self.params['outtmpl']) for url in url_list: try: #It also 
downloads the videos - videos = self.extract_info(url) + self.extract_info(url) except UnavailableVideoError: self.report_error(u'unable to download video') except MaxDownloadsReached: @@ -867,20 +911,26 @@ class YoutubeDL(object): except (IOError, OSError): self.report_warning(u'Unable to remove downloaded video file') - def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - extractor = info_dict.get('extractor_id') + def _make_archive_id(self, info_dict): + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor_key') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist if extractor is None: + return None # Incomplete video information + return extractor.lower() + u' ' + info_dict['id'] + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if vid_id is None: return False # Incomplete video information - # Future-proof against any change in case - # and backwards compatibility with prior versions - extractor = extractor.lower() - vid_id = extractor + u' ' + info_dict['id'] + try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -895,12 +945,15 @@ class YoutubeDL(object): fn = self.params.get('download_archive') if fn is None: return - vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + vid_id = self._make_archive_id(info_dict) + assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + u'\n') @staticmethod def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none': + return 'audio only' if format.get('_resolution') is not None: return format['_resolution'] if format.get('height') is not None: @@ -914,10 +967,11 @@ class YoutubeDL(object): def 
list_formats(self, info_dict): def format_note(fdict): - if fdict.get('format_note') is not None: - return fdict['format_note'] res = u'' - if fdict.get('vcodec') is not None: + if fdict.get('format_note') is not None: + res += fdict['format_note'] + u' ' + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: res += u'video' @@ -933,25 +987,103 @@ class YoutubeDL(object): res += 'audio' if fdict.get('abr') is not None: res += u'@%3dk' % fdict['abr'] + if fdict.get('filesize') is not None: + if res: + res += u', ' + res += format_bytes(fdict['filesize']) return res - def line(format): - return (u'%-20s%-10s%-12s%s' % ( + def line(format, idlen=20): + return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), format_note(format), - ) - ) + )) formats = info_dict.get('formats', [info_dict]) - formats_s = list(map(line, formats)) + idlen = max(len(u'format code'), + max(len(f['format_id']) for f in formats)) + formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', - '_resolution': u'resolution', 'format_note': u'note'}) + '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) + + def urlopen(self, req): + """ Start an HTTP download """ + return self._opener.open(req) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') + try: + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + 
cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_string(u'[debug] Git HEAD: ' + out + u'\n') + except: + try: + sys.exc_clear() + except: + pass + write_string(u'[debug] Python version %s - %s' % + (platform.python_version(), platform_name()) + u'\n') + + proxy_map = {} + for handler in self._opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') + + def _setup_opener(self): + timeout_val = self.params.get('socket_timeout') + timeout = 600 if timeout_val is None else float(timeout_val) + + opts_cookiefile = self.params.get('cookiefile') + opts_proxy = self.params.get('proxy') + + if opts_cookiefile is None: + self.cookiejar = compat_cookiejar.CookieJar() + else: + self.cookiejar = compat_cookiejar.MozillaCookieJar( + opts_cookiefile) + if os.access(opts_cookiefile, os.R_OK): + self.cookiejar.load() + + cookie_processor = compat_urllib_request.HTTPCookieProcessor( + self.cookiejar) + if opts_proxy is not None: + if opts_proxy == '': + proxies = {} + else: + proxies = {'http': opts_proxy, 'https': opts_proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = compat_urllib_request.ProxyHandler(proxies) + https_handler = make_HTTPS_handler( + self.params.get('nocheckcertificate', False)) + opener = compat_urllib_request.build_opener( + https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders = [] + self._opener = opener + + # TODO remove this 
global modification + compat_urllib_request.install_opener(opener) + socket.setdefaulttimeout(timeout) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 19904dbfd..d2446b670 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -36,50 +36,41 @@ __authors__ = ( 'Marcin Cieślak', 'Anton Larionov', 'Takuya Tsuchida', + 'Sergey M.', ) __license__ = 'Public Domain' import codecs -import collections import getpass import optparse import os import random import re import shlex -import socket import subprocess import sys -import traceback -import platform from .utils import ( - compat_cookiejar, compat_print, - compat_str, - compat_urllib_request, DateRange, decodeOption, determine_ext, DownloadError, get_cachedir, - make_HTTPS_handler, MaxDownloadsReached, - platform_name, preferredencoding, SameFileError, std_headers, write_string, - YoutubeDLHandler, ) from .update import update_self -from .version import __version__ from .FileDownloader import ( FileDownloader, ) from .extractor import gen_extractors +from .version import __version__ from .YoutubeDL import YoutubeDL from .PostProcessor import ( FFmpegMetadataPP, @@ -90,11 +81,11 @@ from .PostProcessor import ( def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes): + def _readOptions(filename_bytes, default=[]): try: optionf = open(filename_bytes) except IOError: - return [] # silently skip if file is not present + return default # silently skip if file is not present try: res = [] for l in optionf: @@ -200,7 +191,9 @@ def parseOpts(overrideArguments=None): general.add_option('--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', help='Output descriptions of all supported extractors', default=False) - general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') + general.add_option( + '--proxy', dest='proxy', default=None, metavar='URL', + help='Use the specified HTTP/HTTPS proxy. 
Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', @@ -208,6 +201,9 @@ def parseOpts(overrideArguments=None): general.add_option( '--no-cache-dir', action='store_const', const=None, dest='cachedir', help='Disable filesystem caching') + general.add_option( + '--socket-timeout', dest='socket_timeout', + type=float, default=None, help=optparse.SUPPRESS_HELP) selection.add_option('--playlist-start', @@ -216,7 +212,9 @@ def parseOpts(overrideArguments=None): dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') - selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) + selection.add_option('--max-downloads', metavar='NUMBER', + dest='max_downloads', type=int, default=None, + help='Abort after downloading NUMBER files') selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None) selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 
50k or 44.6m)", default=None) selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) @@ -228,7 +226,7 @@ def parseOpts(overrideArguments=None): default=None, type=int) selection.add_option('--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not present in the archive file. Record all downloaded videos in it.') + help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') authentication.add_option('-u', '--username', @@ -243,7 +241,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default='best', - help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', @@ -325,7 +323,7 @@ def parseOpts(overrideArguments=None): help='print downloaded pages to debug problems(very verbose)') verbosity.add_option('--write-pages', action='store_true', dest='write_pages', default=False, - help='Write downloaded pages to files in the current directory') + help='Write downloaded intermediary pages to files in the current directory to debug problems') verbosity.add_option('--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) @@ -423,6 +421,8 @@ def parseOpts(overrideArguments=None): if opts.verbose: write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n') else: + systemConf = _readOptions('/etc/youtube-dl.conf') + xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if 
xdg_config_home: userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') @@ -432,8 +432,31 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') if not os.path.isfile(userConfFile): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') - systemConf = _readOptions('/etc/youtube-dl.conf') - userConf = _readOptions(userConfFile) + userConf = _readOptions(userConfFile, None) + + if userConf is None: + appdata_dir = os.environ.get('appdata') + if appdata_dir: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config'), + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), + default=None) + + if userConf is None: + userConf = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'), + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'), + default=None) + + if userConf is None: + userConf = [] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) @@ -452,19 +475,6 @@ def _real_main(argv=None): parser, opts, args = parseOpts(argv) - # Open appropriate CookieJar - if opts.cookiefile is None: - jar = compat_cookiejar.CookieJar() - else: - try: - jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile) - if os.access(opts.cookiefile, os.R_OK): - jar.load() - except (IOError, OSError) as err: - if opts.verbose: - traceback.print_exc() - write_string(u'ERROR: unable to open cookie file\n') - sys.exit(101) # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent @@ -496,8 +506,6 @@ def _real_main(argv=None): all_urls = batchurls + args all_urls = [url.strip() for url in all_urls] - opener = _setup_opener(jar=jar, opts=opts) - extractors = gen_extractors() if opts.list_extractors: @@ -552,7 
+560,7 @@ def _real_main(argv=None): if opts.retries is not None: try: opts.retries = int(opts.retries) - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid retry count specified') if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) @@ -563,13 +571,13 @@ def _real_main(argv=None): opts.playliststart = int(opts.playliststart) if opts.playliststart <= 0: raise ValueError(u'Playlist start must be positive') - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist start number specified') try: opts.playlistend = int(opts.playlistend) if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): raise ValueError(u'Playlist end must be greater than playlist start') - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist end number specified') if opts.extractaudio: if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: @@ -672,34 +680,14 @@ def _real_main(argv=None): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': opts.download_archive, + 'cookiefile': opts.cookiefile, + 'nocheckcertificate': opts.no_check_certificate, + 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, } with YoutubeDL(ydl_opts) as ydl: - if opts.verbose: - write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_string(u'[debug] Git HEAD: ' + out + u'\n') - except: - try: - sys.exc_clear() - except: - pass - write_string(u'[debug] Python version %s - %s' % - (platform.python_version(), platform_name()) + 
u'\n') - - proxy_map = {} - for handler in opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - + ydl.print_debug_header() ydl.add_default_info_extractors() # PostProcessors @@ -730,46 +718,9 @@ def _real_main(argv=None): ydl.to_screen(u'--max-download limit reached, aborting.') retcode = 101 - # Dump cookie jar if requested - if opts.cookiefile is not None: - try: - jar.save() - except (IOError, OSError): - sys.exit(u'ERROR: unable to save cookie jar') - sys.exit(retcode) -def _setup_opener(jar=None, opts=None, timeout=300): - if opts is None: - FakeOptions = collections.namedtuple( - 'FakeOptions', ['proxy', 'no_check_certificate']) - opts = FakeOptions(proxy=None, no_check_certificate=False) - - cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy is not None: - if opts.proxy == '': - proxies = {} - else: - proxies = {'http': opts.proxy, 'https': opts.proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) - https_handler = make_HTTPS_handler(opts) - opener = compat_urllib_request.build_opener( - https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders = [] - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(timeout) - return opener - - def main(argv=None): try: _real_main(argv) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f443f11f6..f6a23f663 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -20,9 +20,11 @@ from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE +from .clipsyndicate import ClipsyndicateIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE -from .comedycentral import ComedyCentralIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE @@ -54,7 +56,7 @@ from .flickr import FlickrIE from .francetv import ( PluzzIE, FranceTvInfoIE, - France2IE, + FranceTVIE, GenerationQuoiIE ) from .freesound import FreesoundIE @@ -70,6 +72,7 @@ from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE +from .imdb import ImdbIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE @@ -99,11 +102,13 @@ from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE +from .ninegag import NineGagIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .podomatic import PodomaticIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE @@ -117,6 +122,11 @@ from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE +from .smotri import ( + SmotriIE, + SmotriCommunityIE, + SmotriUserIE, +) from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import ( @@ -135,6 +145,7 @@ from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE +from .theplatform import ThePlatformIE from .thisav import ThisAVIE 
from .toutv import TouTvIE from .traileraddict import TrailerAddictIE @@ -155,19 +166,31 @@ from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE -from .vimeo import VimeoIE, VimeoChannelIE +from .vimeo import ( + VimeoIE, + VimeoChannelIE, + VimeoUserIE, + VimeoAlbumIE, + VimeoGroupsIE, +) from .vine import VineIE +from .viki import VikiIE from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE +from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeIE -from .yahoo import YahooIE, YahooSearchIE +from .yahoo import ( + YahooIE, + YahooNewsIE, + YahooSearchIE, +) from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE @@ -184,6 +207,7 @@ from .youtube import ( YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, + YoutubeHistoryIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index b99d4b966..a3a1b999d 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -13,7 +13,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 691d5a844..2b019daa9 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -28,9 +27,8 @@ class AnitubeIE(InfoExtractor): 
key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, u'key') - webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, + config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8')) video_title = config_xml.find('title').text diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 6d6237f8a..a527f10de 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -10,7 +10,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P[^/]+)/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P[^/]+)/(?P[^/]+)' _TEST = { u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", u"playlist": [ @@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor): }) formats = sorted(formats, key=lambda f: (f['height'], f['width'])) - info = { + playlist.append({ '_type': 'video', 'id': video_id, 'title': title, @@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor): 'upload_date': upload_date, 'uploader_id': uploader_id, 'user_agent': 'QuickTime compatible (youtube-dl)', - } - # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['ext'] - - playlist.append(info) + }) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 61ce4469a..8bb546410 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -11,7 +11,7 @@ from ..utils import ( class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P[^?/]+)(?:[?].*)?$' + _VALID_URL = 
r'(?:https?://)?(?:www\.)?archive\.org/details/(?P[^?/]+)(?:[?].*)?$' _TEST = { u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', @@ -49,7 +49,7 @@ class ArchiveOrgIE(InfoExtractor): for f in formats: f['ext'] = determine_ext(f['url']) - info = { + return { '_type': 'video', 'id': video_id, 'title': title, @@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor): 'description': description, 'uploader': uploader, 'upload_date': upload_date, + 'thumbnail': data.get('misc', {}).get('image'), } - thumbnail = data.get('misc', {}).get('image') - if thumbnail: - info['thumbnail'] = thumbnail - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 44d0b5d70..56a5d009f 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,7 +1,6 @@ # encoding: utf-8 import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -18,8 +17,8 @@ from ..utils import ( # add tests. 
class ArteTvIE(InfoExtractor): - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' - _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' + _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?Pfr|de)/.*-(?P.*?)\.html' + _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @@ -78,8 +77,7 @@ class ArteTvIE(InfoExtractor): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata') - ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml) + ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') @@ -109,9 +107,8 @@ class ArteTvIE(InfoExtractor): """Extract form http://liveweb.arte.tv/""" webpage = self._download_webpage(url, name) video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') - config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, video_id, u'Downloading information') - config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) event_doc = config_doc.find('event') url_node = event_doc.find('video').find('urlHd') if url_node is None: diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 95c038003..bcccc0b7a 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor): u"title": u"[Commie]The Legend of the Legendary Heroes 
- 03 - Replication Eye (Alpha Stigma)[F9410F5A]" } } - _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?' + _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 967568c4a..d48c0c38d 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -25,6 +25,11 @@ class BambuserIE(InfoExtractor): u'uploader': u'pixelversity', u'uploader_id': u'344706', }, + u'params': { + # It doesn't respect the 'Range' header, it would download the whole video + # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59 + u'skip_download': True, + }, } def _real_extract(self, url): @@ -49,7 +54,7 @@ class BambuserIE(InfoExtractor): class BambuserChannelIE(InfoExtractor): IE_NAME = u'bambuser:channel' - _VALID_URL = r'http://bambuser.com/channel/(?P.*?)(?:/|#|\?|$)' + _VALID_URL = r'https?://bambuser\.com/channel/(?P.*?)(?:/|#|\?|$)' # The maximum number we can get with each request _STEP = 50 diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 1aa9dbefd..3a32c14c5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,7 +34,6 @@ class BandcampIE(InfoExtractor): json_code = m_trackinfo.group(1) data = json.loads(json_code) - entries = [] for d in data: formats = [{ 'format_id': 'format_id', diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 3666a780b..755d9c9ef 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?).html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' _TEST = { u'url': 
u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 74a7d13e3..66fe0ac9a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -76,18 +76,21 @@ class BrightcoveIE(InfoExtractor): 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } def find_param(name): - return find_xpath_attr(object_doc, './param', 'name', name) + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return None playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: - params['playerKey'] = playerKey.attrib['value'] + params['playerKey'] = playerKey # The three fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: - params['@videoPlayer'] = videoPlayer.attrib['value'] + params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: - params['linkBaseURL'] = linkBase.attrib['value'] + params['linkBaseURL'] = linkBase data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index bfa2a8b40..7cdcd8399 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import unified_strdate @@ -31,11 +30,10 @@ class CanalplusIE(InfoExtractor): webpage = self._download_webpage(url, mobj.group('path')) video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') info_url = self._VIDEO_INFO_TEMPLATE % video_id - info_page = self._download_webpage(info_url,video_id, + doc = 
self._download_xml(info_url,video_id, u'Downloading video info') self.report_extraction(video_id) - doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) video_info = [video for video in doc if video.find('ID').text == video_id][0] infos = video_info.find('INFOS') media = video_info.find('MEDIA') diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py new file mode 100644 index 000000000..43efb08bf --- /dev/null +++ b/youtube_dl/extractor/clipfish.py @@ -0,0 +1,58 @@ +import re +import time +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class ClipfishIE(InfoExtractor): + IE_NAME = u'clipfish' + + _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P[0-9]+)/' + _TEST = { + u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', + u'file': u'3966754.mp4', + u'md5': u'2521cd644e862936cf2e698206e47385', + u'info_dict': { + u'title': u'FIFA 14 - E3 2013 Trailer', + u'duration': 82, + }, + u'skip': 'Blocked in the US' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % + (video_id, int(time.time()))) + doc = self._download_xml( + info_url, video_id, note=u'Downloading info page') + title = doc.find('title').text + video_url = doc.find('filename').text + if video_url is None: + xml_bytes = xml.etree.ElementTree.tostring(doc) + raise ExtractorError(u'Cannot find video URL in document %r' % + xml_bytes) + thumbnail = doc.find('imageurl').text + duration_str = doc.find('duration').text + m = re.match( + r'^(?P[0-9]+):(?P[0-9]{2}):(?P[0-9]{2}):(?P[0-9]*)$', + duration_str) + if m: + duration = ( + (int(m.group('hours')) * 60 * 60) + + (int(m.group('minutes')) * 60) + + (int(m.group('seconds'))) + ) + else: + duration = None + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': 
thumbnail, + 'duration': duration, + } diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py new file mode 100644 index 000000000..d4fc86973 --- /dev/null +++ b/youtube_dl/extractor/clipsyndicate.py @@ -0,0 +1,52 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, +) + + +class ClipsyndicateIE(InfoExtractor): + _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + + _TEST = { + u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', + u'info_dict': { + u'id': u'4629301', + u'ext': u'mp4', + u'title': u'Brick Briscoe', + u'duration': 612, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + js_player = self._download_webpage( + 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, + video_id, u'Downlaoding player') + # it includes a required token + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + + playlist_page = self._download_webpage( + 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, + video_id, u'Downloading video info') + # Fix broken xml + playlist_page = re.sub('&', '&', playlist_page) + pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + + track_doc = pdoc.find('trackList/track') + def find_param(name): + node = find_xpath_attr(track_doc, './/param', 'name', name) + if node is not None: + return node.attrib['value'] + + return { + 'id': video_id, + 'title': find_param('title'), + 'url': track_doc.find('location').text, + 'thumbnail': find_param('thumbnail'), + 'duration': int(find_param('duration')), + } diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 34adf6dda..a034bb2fb 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,5 +1,4 @@ import re -import 
xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -33,8 +32,7 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path - info_xml = self._download_webpage(info_url, page_title) - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml(info_url, page_title) formats = [] for f in info.findall('files/file'): diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 0c29acfb1..b27c1dfc5 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - metaXml = self._download_webpage(xmlUrl, video_id, + mdoc = self._download_xml(xmlUrl, video_id, u'Downloading info XML', u'Unable to download video info XML') - mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] youtubeIdNode = videoNode.find('./youtubeID') @@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor): if next_url.endswith(u'manifest.f4m'): manifest_url = next_url + '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, + adoc = self._download_xml(manifest_url, video_id, u'Downloading XML manifest', u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) try: video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError: diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 69b2beece..a54ce3ee7 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,7 +1,7 @@ import re -import 
xml.etree.ElementTree from .common import InfoExtractor +from .mtv import MTVServicesInfoExtractor from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +11,31 @@ from ..utils import ( ) -class ComedyCentralIE(InfoExtractor): +class ComedyCentralIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P.*)' + _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + + _TEST = { + u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + u'md5': u'4167875aae411f903b751a21f357f1ee', + u'info_dict': { + u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', + u'ext': u'mp4', + u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', + u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) + + +class ComedyCentralShowsIE(InfoExtractor): IE_DESC = u'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: @@ -127,13 +151,12 @@ class ComedyCentralIE(InfoExtractor): uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' 
+ compat_urllib_parse.urlencode({'uri': uri}) - indexXml = self._download_webpage(indexUrl, epTitle, + idoc = self._download_xml(indexUrl, epTitle, u'Downloading show index', u'unable to download episode index') results = [] - idoc = xml.etree.ElementTree.fromstring(indexXml) itemEls = idoc.findall('.//item') for partNum,itemEl in enumerate(itemEls): mediaId = itemEl.findall('./guid')[0].text @@ -144,10 +167,9 @@ class ComedyCentralIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + compat_urllib_parse.urlencode({'uri': mediaId})) - configXml = self._download_webpage(configUrl, epTitle, + cdoc = self._download_xml(configUrl, epTitle, u'Downloading configuration for %s' % shortMediaId) - cdoc = xml.etree.ElementTree.fromstring(configXml) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) @@ -169,7 +191,7 @@ class ComedyCentralIE(InfoExtractor): }) effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) - info = { + results.append({ 'id': shortMediaId, 'formats': formats, 'uploader': showId, @@ -177,11 +199,6 @@ class ComedyCentralIE(InfoExtractor): 'title': effTitle, 'thumbnail': None, 'description': compat_str(officialTitle), - } - - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - results.append(info) + }) return results diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cebeaf29..92a0c5050 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,11 +4,11 @@ import re import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, - compat_urllib_request, compat_str, clean_html, @@ -19,6 +19,7 @@ from ..utils import ( unescapeHTML, ) + class InfoExtractor(object): """Information Extractor class. 
@@ -54,6 +55,9 @@ class InfoExtractor(object): subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. view_count: How many users have watched the video on the platform. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + comment_count: Number of comments on the video urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen age_limit: Age restriction for the video, as an integer (years) @@ -75,6 +79,7 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) @@ -156,7 +161,7 @@ class InfoExtractor(object): elif note is not False: self.to_screen(u'%s: %s' % (video_id, note)) try: - return compat_urllib_request.urlopen(url_or_request) + return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' @@ -208,6 +213,12 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + def _download_xml(self, url_or_request, video_id, + note=u'Downloading XML', errnote=u'Unable to download XML'): + """Return the xml as an xml.etree.ElementTree.Element""" + xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def to_screen(self, msg): """Print msg to screen, prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) @@ -356,7 +367,8 @@ class InfoExtractor(object): if 
display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\']) + r'''(?ix)<meta + (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=False) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7bf03c584..d5730684d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -6,7 +6,7 @@ from ..utils import ( ) class CSpanIE(InfoExtractor): - _VALID_URL = r'http://www.c-spanvideo.org/program/(.*)' + _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' _TEST = { u'url': u'http://www.c-spanvideo.org/program/HolderonV', u'file': u'315139.flv', diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 71f5e03ee..3bd0b862c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, orderedSet, + str_to_int, ExtractorError, ) @@ -146,6 +147,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return + view_count = str_to_int(self._search_regex( + r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) + return { 'id': video_id, 'formats': formats, @@ -155,6 +159,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, + 'view_count': view_count, } def _get_available_subtitles(self, video_id, webpage): diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index a804e83bd..d418ce4a8 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -29,17 +28,16 @@ class 
DaumIE(InfoExtractor): video_id = mobj.group(1) canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) - full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', + full_id = self._search_regex( + r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', webpage, u'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) - info_xml = self._download_webpage( + info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, u'Downloading video info') - urls_xml = self._download_webpage( + urls = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query, video_id, u'Downloading video formats info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) - urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) self.to_screen(u'%s: Getting video urls' % video_id) formats = [] @@ -49,10 +47,9 @@ class DaumIE(InfoExtractor): 'vid': full_id, 'profile': profile, }) - url_xml = self._download_webpage( + url_doc = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, video_id, note=False) - url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8')) format_url = url_doc.find('result/url').text formats.append({ 'url': format_url, @@ -60,7 +57,7 @@ class DaumIE(InfoExtractor): 'format_id': profile, }) - info = { + return { 'id': video_id, 'title': info.find('TITLE').text, 'formats': formats, @@ -69,6 +66,3 @@ class DaumIE(InfoExtractor): 'duration': int(info.find('DURATION').text), 'upload_date': info.find('REGDTTM').text[:8], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 765cb1f37..cb7226f82 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,7 +1,6 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -12,7 +11,7 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", u'file': u'36983.webm', @@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details') - details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8')) + details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details') thumbnail_els = details_doc.findall('.//teaserimage') thumbnails = [{ @@ -67,7 +65,7 @@ class DreiSatIE(InfoExtractor): return (qidx, prefer_http, format['video_bitrate']) formats.sort(key=_sortkey) - 
info = { + return { '_type': 'video', 'id': video_id, 'title': video_title, @@ -78,8 +76,3 @@ class DreiSatIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': upload_date, } - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index f02c6998b..877113d63 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - config_xml = self._download_webpage( + config = self._download_xml( 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video_url = config.find('file').text return { diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index f21ef8853..88f5526b8 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -10,7 +10,7 @@ from ..utils import ( class EightTracksIE(InfoExtractor): IE_NAME = '8tracks' - _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' + _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' _TEST = { u"name": u"EightTracks", u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a", diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index a51d79b08..682901d16 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor): IE_NAME = u'exfm' IE_DESC = u'ex.fm' _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' - _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _SOUNDCLOUD_URL = 
r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { u'url': u'http://ex.fm/song/eh359', diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 89ed08db4..c6ab6952e 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -10,7 +9,7 @@ from ..utils import ( class FazIE(InfoExtractor): IE_NAME = u'faz.net' - _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' _TEST = { u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', @@ -28,9 +27,8 @@ class FazIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, u'config xml url') - config_xml = self._download_webpage(config_xml_url, video_id, + config = self._download_xml(config_xml_url, video_id, u'Downloading config xml') - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) encodings = config.find('ENCODINGS') formats = [] @@ -46,13 +44,10 @@ class FazIE(InfoExtractor): }) descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') - info = { + return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, 'description': descr, 'thumbnail': config.find('STILL/STILL_BIG').text, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index dba1a8dc2..d7048c8c1 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -12,7 +12,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv' - _VALID_URL = 
r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' _TEST = { u'url': u'http://fernsehkritik.tv/folge-1', @@ -52,7 +52,7 @@ class FKTVIE(InfoExtractor): class FKTVPosteckeIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv:postecke' - _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' _TEST = { u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', u'file': u'0120.flv', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 086cafca0..ad85bc16d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -11,11 +10,10 @@ from ..utils import ( class FranceTVBaseInfoExtractor(InfoExtractor): def _extract_video(self, video_id): - xml_desc = self._download_webpage( + info = self._download_xml( 'http://www.francetvinfo.fr/appftv/webservices/video/' 'getInfosOeuvre.php?id-diffusion=' + video_id, video_id, 'Downloading XML config') - info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8')) manifest_url = info.find('videos/video/url').text video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8') @@ -23,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): thumbnail_path = info.find('image').text return {'id': video_id, - 'ext': 'mp4', + 'ext': 'flv' if video_url.startswith('rtmp') else 'mp4', 'url': video_url, 'title': info.find('titre').text, 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), @@ -47,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 
u'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html' + _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html' _TEST = { u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -68,35 +66,101 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id) -class France2IE(FranceTVBaseInfoExtractor): - IE_NAME = u'france2.fr' - _VALID_URL = r'''(?x)https?://www\.france2\.fr/ +class FranceTVIE(FranceTVBaseInfoExtractor): + IE_NAME = u'francetv' + IE_DESC = u'France 2, 3, 4, 5 and Ô' + _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: - emissions/.*?/videos/(?P<id>\d+) - | emission/(?P<key>[^/?]+) + emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) + | (emissions?|jt)/(?P<key>[^/?]+) )''' - _TEST = { - u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - u'file': u'75540104.mp4', - u'info_dict': { - u'title': u'13h15, le samedi...', - u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + _TESTS = [ + # france2 + { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, }, - u'params': { - u'skip_download': True, + # france3 + { + u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', + u'info_dict': { + u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', + u'ext': u'flv', + u'title': u'Le scandale du prix des médicaments', + u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, }, - } + # france4 + { + u'url': 
u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'info_dict': { + u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'ext': u'flv', + u'title': u'Hero Corp Making of - Extrait 1', + u'description': u'md5:c87d54871b1790679aec1197e73d650a', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, + # france5 + { + u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968', + u'info_dict': { + u'id': u'92837968', + u'ext': u'mp4', + u'title': u'C à dire ?!', + u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + }, + # franceo + { + u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013', + u'info_dict': { + u'id': u'92327925', + u'ext': u'mp4', + u'title': u'Infô-Afrique', + u'description': u'md5:ebf346da789428841bee0fd2a935ea55', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + u'skip': u'The id changes frequently', + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj.group('key'): webpage = self._download_webpage(url, mobj.group('key')) - video_id = self._html_search_regex( - r'''(?x)<div\s+class="video-player">\s* + id_res = [ + (r'''(?x)<div\s+class="video-player">\s* <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+ - class="francetv-video-player">''', - webpage, u'video ID') + class="francetv-video-player">'''), + (r'<a id="player_direct" href="http://info\.francetelevisions' + '\.fr/\?id-video=([^"/&]+)'), + (r'<a class="video" id="ftv_player_(.+?)"'), + ] + video_id = self._html_search_regex(id_res, webpage, u'video ID') else: video_id = mobj.group('id') return self._extract_video(video_id) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index c91669b0e..a3a5251fe 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -4,7 +4,7 @@ from .common 
import InfoExtractor class GamekingsIE(InfoExtractor): - _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' + _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", u'file': u'20130811.mp4', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 9645b00c3..26b7d2ae5 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor): 'format_id': q, }) - info = { + return { 'id': data_video['guid'], 'title': compat_urllib_parse.unquote(data_video['title']), 'formats': formats, 'description': get_meta_content('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 3cc02d97e..d82a5d4b2 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,13 +1,10 @@ import re -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor -class GametrailersIE(MTVIE): - """ - Gametrailers use the same videos system as MTVIE, it just changes the feed - url, where the uri is and the method to get the thumbnails. - """ - _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' + +class GametrailersIE(MTVServicesInfoExtractor): + _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', @@ -17,15 +14,9 @@ class GametrailersIE(MTVIE): u'description': u'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } - # Overwrite MTVIE properties we don't want - _TESTS = [] _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0b5f2b2bb..216e03218 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -169,8 +169,13 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex(r'<title>(.*)', - webpage, u'video title', default=u'video', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)(.*?)', webpage, u'video title', + default=u'video') + + # video uploader is domain name + video_uploader = self._search_regex( + r'^(?:https?://)?([^/]*)/.*', url, u'video uploader') # Look for BrightCove: bc_url = BrightcoveIE._extract_brightcove_url(webpage) @@ -188,13 +193,35 @@ class GenericIE(InfoExtractor): # Look for embedded YouTube player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for embedded Dailymotion player + matches = re.findall( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) + if matches: + urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') + for tuppl in matches] 
+ return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + + # Look for embedded Wistia player + match = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': unescapeHTML(match.group('url')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': video_id, + } + # Look for Bandcamp pages with custom domain mobj = re.search(r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -209,7 +236,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) if mobj is None: # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage) + mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage) if mobj is None: # Try to find twitter cards info mobj = re.search(r'video)/id/(?P.+)' + _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'
(.+?)
' diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py new file mode 100644 index 000000000..6fb373db2 --- /dev/null +++ b/youtube_dl/extractor/imdb.py @@ -0,0 +1,57 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_attribute, +) + + +class ImdbIE(InfoExtractor): + IE_NAME = u'imdb' + IE_DESC = u'Internet Movie Database trailers' + _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P\d+)' + + _TEST = { + u'url': u'http://www.imdb.com/video/imdb/vi2524815897', + u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068', + u'info_dict': { + u'id': u'2524815897', + u'ext': u'mp4', + u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + u'description': u'md5:9061c2219254e5d14e03c25c98e96a81', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url,video_id) + descr = get_element_by_attribute('itemprop', 'description', webpage) + available_formats = re.findall( + r'case \'(?P.*?)\' :$\s+url = \'(?P.*?)\'', webpage, + flags=re.MULTILINE) + formats = [] + for f_id, f_path in available_formats: + f_path = f_path.strip() + format_page = self._download_webpage( + compat_urlparse.urljoin(url, f_path), + u'Downloading info for %s format' % f_id) + json_data = self._search_regex( + r']+class="imdb-player-data"[^>]*?>(.*?)', + format_page, u'json data', flags=re.DOTALL) + info = json.loads(json_data) + format_info = info['videoPlayerObject']['video'] + formats.append({ + 'format_id': f_id, + 'url': format_info['url'], + }) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': descr, + 'thumbnail': format_info['slate'], + } diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 213aac428..660573d02 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,7 +3,7 @@ 
import re from .common import InfoExtractor class InstagramIE(InfoExtractor): - _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' + _VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/' _TEST = { u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index be8e05f53..16a6f73c8 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor): video_id = query_dic['publishedid'][0] url = self._build_url(query) - flashconfiguration_xml = self._download_webpage(url, video_id, + flashconfiguration = self._download_xml(url, video_id, u'Downloading flash configuration') - flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor): file_url = re.sub(r'(?<=\?)(.+)$', lambda m: self._clean_query(m.group()), file_url) - info_xml = self._download_webpage(file_url, video_id, + info = self._download_xml(file_url, video_id, u'Downloading video info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 0020c47cf..caf9d8c85 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -2,7 +2,6 @@ import json import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor): 
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', xml_link, u'video ID') - xml_config = self._download_webpage( + config = self._download_xml( xml_link, title, u'Downloading XML config') - config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) - info_json = self._search_regex( - r'(?sm)(.*?)', - xml_config, u'JSON information') + info_json = config.find('format.json').text info = json.loads(info_json)['versions'][0] video_url = 'http://video720.jeuxvideo.com/' + info['file'] diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index c7bb234fe..592c64e1d 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -8,7 +8,7 @@ from ..utils import ( ) class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P[a-z0-9\-]+).html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P[a-z0-9\-]+)\.html' _IFRAME = r'