From d5a9bb4ea97287e633e891ddd1a416619c9aada9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Sat, 19 Oct 2013 14:04:44 -0300 Subject: [PATCH 001/121] extractor: youtube: Swap video dimensions to match standard practice. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on this, I thought about simplifying things like changing 480x854 to 480p, and that seemed like a good option, until I realized that people (me included) usually link the concept of some number followed by a p with the video being 16:9. So, we would be losing some information and, as we all know, [explicit is better than implicit][*]. [*]: http://www.python.org/dev/peps/pep-0020/ This closes #1446. Signed-off-by: Rogério Brito --- youtube_dl/extractor/youtube.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb7c42830..143fac98a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -253,21 +253,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '248': 'webm', } _video_dimensions = { - '5': '240x400', + '5': '400x240', '6': '???', '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '36': '240x320', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', + '17': '176x144', + '18': '640x360', + '22': '1280x720', + '34': '640x360', + '35': '854x480', + '36': '320x240', + '37': '1920x1080', + '38': '4096x3072', + '43': '640x360', + '44': '854x480', + '45': '1280x720', + '46': '1920x1080', '82': '360p', '83': '480p', '84': '720p', From 4894fe8c5baec8b1f21ac6fdebe08175abc7f094 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Tue, 29 Oct 2013 01:05:21 +0100 Subject: [PATCH 002/121] Report download progress of rtmpdump --- youtube_dl/FileDownloader.py | 70 ++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 8ecabab1a..664b78662 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -268,6 +268,61 @@ class FileDownloader(object): (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): + def run_rtmpdump(args): + start = time.time() + resume_percent = None + resume_downloaded_data_len = None + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + cursor_in_new_line = True + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = u'' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1))*1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len + eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent) + speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 
100 / percent) + data_len_str = u'~'+self.format_bytes(data_len) + self.report_progress(percent, data_len_str, speed, eta) + cursor_in_new_line = False + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'total_bytes': data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'eta': eta, + 'speed': speed, + }) + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen(u'') + cursor_in_new_line = True + self.to_screen(u'[rtmpdump] '+line) + proc.wait() + if not cursor_in_new_line: + self.to_screen(u'') + return proc.returncode + self.report_destination(filename) tmpfilename = self.temp_name(filename) test = self.params.get('test', False) @@ -278,12 +333,11 @@ class FileDownloader(object): except (OSError, IOError): self.report_error(u'RTMP download detected but "rtmpdump" could not be run') return False - verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet' # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. - basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename] + basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: @@ -302,23 +356,25 @@ class FileDownloader(object): except ImportError: shell_quote = repr self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) - retval = subprocess.call(args) + + retval = run_rtmpdump(args) + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) + self.to_screen(u'[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) + retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) cursize = os.path.getsize(encodeFilename(tmpfilename)) if prevsize == cursize and retval == 1: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those if prevsize == cursize and retval == 2 and cursize > 1024: - self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.') retval = 0 break if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) + self.to_screen(u'[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, From dca087205692c934163ec9aca5962056f890cd19 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 22 Nov 2013 19:57:52 +0100 Subject: [PATCH 003/121] Move the opener to the YoutubeDL object. This is the first step towards being able to just import youtube_dl and start using it. Apart from removing global state, this would fix problems like #1805. 
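As a rough sketch of the library-style usage this change works towards: the option names and methods below are the ones visible in this patch (YoutubeDL(params), add_default_info_extractors(), download()), while the option values and the URL are placeholders rather than part of the change.

    # Minimal, hypothetical embedding of youtube-dl after this refactoring;
    # the opener and cookie jar are now set up inside YoutubeDL.__init__,
    # so no module-level setup call is needed.
    from youtube_dl import YoutubeDL

    ydl_opts = {
        'outtmpl': u'%(title)s-%(id)s.%(ext)s',  # output filename template
        'nocheckcertificate': False,             # documented in this patch
        'cookiefile': None,                      # optional path to a Mozilla cookie jar
    }
    with YoutubeDL(ydl_opts) as ydl:         # __exit__ saves the cookie jar if one was given
        ydl.add_default_info_extractors()    # register the bundled extractors
        ydl.download([u'http://www.youtube.com/watch?v=EXAMPLE'])  # placeholder URL
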
--- youtube_dl/YoutubeDL.py | 86 ++++++++++++++++++++++++++++- youtube_dl/__init__.py | 98 +++------------------------------- youtube_dl/extractor/common.py | 4 +- youtube_dl/utils.py | 4 +- 4 files changed, 96 insertions(+), 96 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a2e3df1f9..72ccfa2ae 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -7,8 +7,10 @@ import errno import io import json import os +import platform import re import shutil +import subprocess import socket import sys import time @@ -18,6 +20,7 @@ if os.name == 'nt': import ctypes from .utils import ( + compat_cookiejar, compat_http_client, compat_print, compat_str, @@ -31,8 +34,10 @@ from .utils import ( encodeFilename, ExtractorError, locked_file, + make_HTTPS_handler, MaxDownloadsReached, PostProcessingError, + platform_name, preferredencoding, SameFileError, sanitize_filename, @@ -41,9 +46,11 @@ from .utils import ( UnavailableVideoError, write_json_file, write_string, + YoutubeDLHandler, ) from .extractor import get_info_extractor, gen_extractors from .FileDownloader import FileDownloader +from .version import __version__ class YoutubeDL(object): @@ -120,6 +127,8 @@ class YoutubeDL(object): downloadarchive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. + cookiefile: File name where cookies should be read from and dumped to. + nocheckcertificate Do not verify SSL certificates The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -160,6 +169,8 @@ class YoutubeDL(object): if '%(stitle)s' in self.params['outtmpl']: self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') + self._setup_opener() + def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) @@ -235,6 +246,9 @@ class YoutubeDL(object): def __exit__(self, *args): self.restore_console_title() + + if self.params.get('cookiefile') is not None: + self.cookiejar.save() def fixed_template(self): """Checks if the output template is fixed.""" @@ -774,7 +788,7 @@ class YoutubeDL(object): for url in url_list: try: #It also downloads the videos - videos = self.extract_info(url) + self.extract_info(url) except UnavailableVideoError: self.report_error(u'unable to download video') except MaxDownloadsReached: @@ -885,3 +899,73 @@ class YoutubeDL(object): '_resolution': u'resolution', 'format_note': u'note'}) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) + + def urlopen(self, req): + """ Start an HTTP download """ + return self._opener.open(req) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') + try: + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_string(u'[debug] Git HEAD: ' + out + u'\n') + except: + try: + sys.exc_clear() + except: + pass + write_string(u'[debug] Python version %s - %s' % + (platform.python_version(), platform_name()) + u'\n') + + proxy_map = {} + for handler in self._opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + 
write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') + + def _setup_opener(self, timeout=300): + opts_cookiefile = self.params.get('cookiefile') + opts_proxy = self.params.get('proxy') + + if opts_cookiefile is None: + self.cookiejar = compat_cookiejar.CookieJar() + else: + self.cookiejar = compat_cookiejar.MozillaCookieJar( + opts_cookiefile) + if os.access(opts_cookiefile, os.R_OK): + self.cookiejar.load() + + cookie_processor = compat_urllib_request.HTTPCookieProcessor( + self.cookiejar) + if opts_proxy is not None: + if opts_proxy == '': + proxies = {} + else: + proxies = {'http': opts_proxy, 'https': opts_proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = compat_urllib_request.ProxyHandler(proxies) + https_handler = make_HTTPS_handler( + self.params.get('nocheckcertificate', False)) + opener = compat_urllib_request.build_opener( + https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders = [] + self._opener = opener + + # TODO remove this global modification + compat_urllib_request.install_opener(opener) + socket.setdefaulttimeout(timeout) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 64ebf4d48..27886593b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -40,45 +40,35 @@ __authors__ = ( __license__ = 'Public Domain' import codecs -import collections import getpass import optparse import os import random import re import shlex -import socket import subprocess import sys -import traceback -import platform from .utils import ( - compat_cookiejar, compat_print, - compat_str, - compat_urllib_request, DateRange, decodeOption, determine_ext, DownloadError, get_cachedir, - make_HTTPS_handler, MaxDownloadsReached, - platform_name, preferredencoding, SameFileError, std_headers, write_string, - YoutubeDLHandler, ) from .update import update_self -from .version import __version__ from .FileDownloader import ( FileDownloader, ) from .extractor import gen_extractors +from .version import __version__ from .YoutubeDL import YoutubeDL from .PostProcessor import ( FFmpegMetadataPP, @@ -451,19 +441,6 @@ def _real_main(argv=None): parser, opts, args = parseOpts(argv) - # Open appropriate CookieJar - if opts.cookiefile is None: - jar = compat_cookiejar.CookieJar() - else: - try: - jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile) - if os.access(opts.cookiefile, os.R_OK): - jar.load() - except (IOError, OSError) as err: - if opts.verbose: - traceback.print_exc() - write_string(u'ERROR: unable to open cookie file\n') - sys.exit(101) # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent @@ -495,8 +472,6 @@ def _real_main(argv=None): all_urls = batchurls + args all_urls = [url.strip() for url in all_urls] - opener = _setup_opener(jar=jar, opts=opts) - extractors = gen_extractors() if opts.list_extractors: @@ -551,7 +526,7 @@ def _real_main(argv=None): if opts.retries is not None: try: opts.retries = int(opts.retries) - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid retry count specified') if opts.buffersize is not None: 
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) @@ -562,13 +537,13 @@ def _real_main(argv=None): opts.playliststart = int(opts.playliststart) if opts.playliststart <= 0: raise ValueError(u'Playlist start must be positive') - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist start number specified') try: opts.playlistend = int(opts.playlistend) if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): raise ValueError(u'Playlist end must be greater than playlist start') - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist end number specified') if opts.extractaudio: if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: @@ -671,34 +646,12 @@ def _real_main(argv=None): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': opts.download_archive, + 'cookiefile': opts.cookiefile, + 'nocheckcertificate': opts.no_check_certificate, } with YoutubeDL(ydl_opts) as ydl: - if opts.verbose: - write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_string(u'[debug] Git HEAD: ' + out + u'\n') - except: - try: - sys.exc_clear() - except: - pass - write_string(u'[debug] Python version %s - %s' % - (platform.python_version(), platform_name()) + u'\n') - - proxy_map = {} - for handler in opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - + ydl.print_debug_header() ydl.add_default_info_extractors() # PostProcessors @@ -729,46 +682,9 @@ def _real_main(argv=None): ydl.to_screen(u'--max-download limit reached, aborting.') retcode = 101 - # Dump cookie jar if requested - if opts.cookiefile is not None: - try: - jar.save() - except (IOError, OSError): - sys.exit(u'ERROR: unable to save cookie jar') - sys.exit(retcode) -def _setup_opener(jar=None, opts=None, timeout=300): - if opts is None: - FakeOptions = collections.namedtuple( - 'FakeOptions', ['proxy', 'no_check_certificate']) - opts = FakeOptions(proxy=None, no_check_certificate=False) - - cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy is not None: - if opts.proxy == '': - proxies = {} - else: - proxies = {'http': opts.proxy, 'https': opts.proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) - https_handler = make_HTTPS_handler(opts) - opener = compat_urllib_request.build_opener( - https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders = [] - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(timeout) - return opener - - def main(argv=None): try: _real_main(argv) diff --git a/youtube_dl/extractor/common.py 
b/youtube_dl/extractor/common.py index eb3435c77..423e54cea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -8,7 +8,6 @@ import netrc from ..utils import ( compat_http_client, compat_urllib_error, - compat_urllib_request, compat_str, clean_html, @@ -19,6 +18,7 @@ from ..utils import ( unescapeHTML, ) + class InfoExtractor(object): """Information Extractor class. @@ -156,7 +156,7 @@ class InfoExtractor(object): elif note is not False: self.to_screen(u'%s: %s' % (video_id, note)) try: - return compat_urllib_request.urlopen(url_or_request) + return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0720fe9eb..0d2b7bd10 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -535,7 +535,7 @@ def formatSeconds(secs): else: return '%d' % secs -def make_HTTPS_handler(opts): +def make_HTTPS_handler(opts_no_check_certificate): if sys.version_info < (3,2): # Python's 2.x handler is very simplistic return compat_urllib_request.HTTPSHandler() @@ -545,7 +545,7 @@ def make_HTTPS_handler(opts): context.set_default_verify_paths() context.verify_mode = (ssl.CERT_NONE - if opts.no_check_certificate + if opts_no_check_certificate else ssl.CERT_REQUIRED) return compat_urllib_request.HTTPSHandler(context=context) From 2e767313e49b43400b3baae247e0f4c9e9e24992 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 06:52:21 +0100 Subject: [PATCH 004/121] [update] fix error --- youtube_dl/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index be7800e8b..cd9670166 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -86,7 +86,7 @@ def update_self(to_screen, verbose): def version_tuple(version_str): return tuple(map(int, version_str.split('.'))) - if version_tuple(__version__) >= version_tuple(version_str): + if version_tuple(__version__) >= version_tuple(version_id): to_screen(u'youtube-dl is up to date (%s)' % __version__) return From 23e6d50d73188eab26944e41f164a5a1ab7f547a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 06:52:53 +0100 Subject: [PATCH 005/121] [bandcamp] Remove unused variable --- youtube_dl/extractor/bandcamp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 1aa9dbefd..3a32c14c5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,7 +34,6 @@ class BandcampIE(InfoExtractor): json_code = m_trackinfo.group(1) data = json.loads(json_code) - entries = [] for d in data: formats = [{ 'format_id': 'format_id', From bd49928f7a0254eeb8d5f918c5649ce4eb78ef36 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 06:53:50 +0100 Subject: [PATCH 006/121] [niconico] Clarify download --- youtube_dl/extractor/niconico.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 22898b5a1..729607ea3 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -77,9 +77,9 @@ class NiconicoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - # Get video webpage - video_webpage = self._download_webpage( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) + # Get video 
webpage. We are not actually interested in it, but need + # the cookies in order to be able to download the info webpage + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) video_info_webpage = self._download_webpage( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, From 66ec0192406bbf1bffcb6c4e72fe1529f1e21195 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 06:54:26 +0100 Subject: [PATCH 007/121] [youtube] do not use variable name twice --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 126688652..07a457f4d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1571,8 +1571,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids] + url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] return self.playlist_result(url_results, playlist_id, playlist_title) From 382ed50e0ecfb2fa692049030c858b99159c791b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 07:30:05 +0100 Subject: [PATCH 008/121] [viki] Add extractor (fixes #1813) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/viki.py | 91 ++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/viki.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f443f11f6..867734fa2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -157,6 +157,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .viki import VikiIE from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py new file mode 100644 index 000000000..78d03c079 --- /dev/null +++ b/youtube_dl/extractor/viki.py @@ -0,0 +1,91 @@ +import re + +from ..utils import ( + unified_strdate, +) +from .subtitles import SubtitlesInfoExtractor + + +class VikiIE(SubtitlesInfoExtractor): + IE_NAME = u'viki' + + _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' + _TEST = { + u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', + u'file': u'1023585v.mp4', + u'md5': u'a21454021c2646f5433514177e2caa5f', + u'info_dict': { + u'title': u'Heirs Episode 14', + u'uploader': u'SBS', + u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', + u'upload_date': u'20131121', + u'age_limit': 13, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + uploader = self._html_search_regex( + r'Broadcast Network: \s*([^<]*)<', webpage, + u'uploader') + if uploader is not None: + uploader = uploader.strip() + + rating_str = self._html_search_regex( + r'Rating: \s*([^<]*)<', webpage, + u'rating information', default='').strip() + RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, + } + age_limit = RATINGS.get(rating_str) + + info_url = 
'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id + info_webpage = self._download_webpage(info_url, video_id) + video_url = self._html_search_regex( + r']+src="([^"]+)"', info_webpage, u'video URL') + + upload_date_str = self._html_search_regex( + r'"created_at":"([^"]+)"', info_webpage, u'upload date') + upload_date = ( + unified_strdate(upload_date_str) + if upload_date_str is not None + else None + ) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, info_webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, info_webpage) + return + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'description': description, + 'thumbnail': thumbnail, + 'age_limit': age_limit, + 'uploader': uploader, + 'subtitles': video_subtitles, + 'upload_date': upload_date, + } + + def _get_available_subtitles(self, video_id, info_webpage): + res = {} + for sturl in re.findall(r''): + m = re.search(r'/(?P[a-z]+)\.vtt', sturl) + if not m: + continue + res[m.group('lang')] = sturl + return res From eaaafc59c2f8ffaee4df06092a57f65eec1b6eaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 07:30:34 +0100 Subject: [PATCH 009/121] release 2013.11.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f6d18f945..68ef46a30 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22.2' +__version__ = '2013.11.24' From 0c7c19d6bc55a624532f2426d080aea51962cfe0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 07:51:44 +0100 Subject: [PATCH 010/121] [clipfish] Add extractor (Fixes #1760) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clipfish.py | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 youtube_dl/extractor/clipfish.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 867734fa2..4c280fa5e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py new file mode 100644 index 000000000..95449da3c --- /dev/null +++ b/youtube_dl/extractor/clipfish.py @@ -0,0 +1,53 @@ +import re +import time +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class ClipfishIE(InfoExtractor): + IE_NAME = u'clipfish' + + _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P[0-9]+)/' + _TEST = { + u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', + u'file': u'4028320.f4v', + u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'info_dict': { + u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', + u'duration': 399, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % + (video_id, int(time.time()))) + info_xml = self._download_webpage( + info_url, video_id, note=u'Downloading info page') + doc = 
xml.etree.ElementTree.fromstring(info_xml) + title = doc.find('title').text + video_url = doc.find('filename').text + thumbnail = doc.find('imageurl').text + duration_str = doc.find('duration').text + m = re.match( + r'^(?P[0-9]+):(?P[0-9]{2}):(?P[0-9]{2}):(?P[0-9]*)$', + duration_str) + if m: + duration = ( + (int(m.group('hours')) * 60 * 60) + + (int(m.group('minutes')) * 60) + + (int(m.group('seconds'))) + ) + else: + duration = None + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'duration': duration, + } From 138df537ffaeda182789440c4086f009a739dde3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 07:51:56 +0100 Subject: [PATCH 011/121] release 2013.11.24.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 68ef46a30..de92411bb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.24' +__version__ = '2013.11.24.1' From d214fdb8fe796e92485e28038ee72d28caa3ad10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 24 Nov 2013 11:02:34 +0100 Subject: [PATCH 012/121] [brightcove] Don't use 'or' with the xml nodes, use the 'value' attribute instead --- youtube_dl/extractor/brightcove.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 74a7d13e3..66fe0ac9a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -76,18 +76,21 @@ class BrightcoveIE(InfoExtractor): 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } def find_param(name): - return find_xpath_attr(object_doc, './param', 'name', name) + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return None playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: - params['playerKey'] = playerKey.attrib['value'] + params['playerKey'] = playerKey # The three fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: - params['@videoPlayer'] = videoPlayer.attrib['value'] + params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: - params['linkBaseURL'] = linkBase.attrib['value'] + params['linkBaseURL'] = linkBase data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data From dc65dcbb6d709ef6e38f336fe77c14767d6c8f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 24 Nov 2013 11:28:44 +0100 Subject: [PATCH 013/121] [mixcloud] The description field may be missing (fixes #1819) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a200dcd74..e2baf44d7 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor): 'title': info['name'], 'url': final_song_url, 'ext': 'mp3', - 'description': info['description'], + 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], 'uploader_id': info['user']['username'], From 
f459d17018812dc896324f8208cdfe2ada04ea50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 24 Nov 2013 14:33:50 +0100 Subject: [PATCH 014/121] [youtube] Add an extractor for downloading the watch history (closes #1821) --- test/test_all_urls.py | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56e5f80e1..42813da1a 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -100,6 +100,7 @@ class TestAllURLsMatching(unittest.TestCase): def test_keywords(self): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) + self.assertMatch(':ythistory', ['youtube:history']) self.assertMatch(':thedailyshow', ['ComedyCentral']) self.assertMatch(':tds', ['ComedyCentral']) self.assertMatch(':colbertreport', ['ComedyCentral']) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4c280fa5e..1fbd10bc5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -186,6 +186,7 @@ from .youtube import ( YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, + YoutubeHistoryIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 07a457f4d..64d4c2445 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1826,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _PAGING_STEP = 100 _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PERSONAL_FEED = True + _PLAYLIST_TITLE = u'Youtube Watch History' + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') + data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') + # The step is actually a ridiculously big number (like 1374343569725646) + self._PAGING_STEP = int(data_paging) + return super(YoutubeHistoryIE, self)._real_extract(url) + class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' From 267ed0c5d3547c68f1d34203c2ae4b0d826a29d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 24 Nov 2013 14:59:19 +0100 Subject: [PATCH 015/121] [collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes #1822) Uses a new helper method in InfoExtractor: _download_xml --- youtube_dl/extractor/collegehumor.py | 7 ++----- youtube_dl/extractor/common.py | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 0c29acfb1..b27c1dfc5 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - metaXml = self._download_webpage(xmlUrl, video_id, + mdoc = self._download_xml(xmlUrl, video_id, 
u'Downloading info XML', u'Unable to download video info XML') - mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] youtubeIdNode = videoNode.find('./youtubeID') @@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor): if next_url.endswith(u'manifest.f4m'): manifest_url = next_url + '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, + adoc = self._download_xml(manifest_url, video_id, u'Downloading XML manifest', u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) try: video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cebeaf29..482a231ec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,6 +4,7 @@ import re import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, @@ -208,6 +209,11 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + """Return the xml as an xml.etree.ElementTree.Element""" + xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def to_screen(self, msg): """Print msg to screen, prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) From a1ee09e815cb413d67cee17ad686224b26182dfb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 15:03:25 +0100 Subject: [PATCH 016/121] Document proxy --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 72ccfa2ae..0a845a344 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -128,7 +128,8 @@ class YoutubeDL(object): Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. 
- nocheckcertificate Do not verify SSL certificates + nocheckcertificate:Do not verify SSL certificates + proxy: URL of the proxy server to use The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: From b7553b25543175c27c885b0c6ab77d91b270a520 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 15:20:16 +0100 Subject: [PATCH 017/121] [vik] Clarify output --- youtube_dl/extractor/viki.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 78d03c079..8088dcf0b 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -51,7 +51,8 @@ class VikiIE(SubtitlesInfoExtractor): age_limit = RATINGS.get(rating_str) info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id - info_webpage = self._download_webpage(info_url, video_id) + info_webpage = self._download_webpage( + info_url, video_id, note=u'Downloading info page') video_url = self._html_search_regex( r']+src="([^"]+)"', info_webpage, u'video URL') From 6d88bc37a32d5d624c09d68cd19e64e6095fa5de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 24 Nov 2013 15:28:33 +0100 Subject: [PATCH 018/121] [viki] Skip travis test Also provide a better error message for geoblocked videos. --- youtube_dl/extractor/viki.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 8088dcf0b..7b3a58de8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,6 +1,7 @@ import re from ..utils import ( + ExtractorError, unified_strdate, ) from .subtitles import SubtitlesInfoExtractor @@ -20,7 +21,8 @@ class VikiIE(SubtitlesInfoExtractor): u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', u'upload_date': u'20131121', u'age_limit': 13, - } + }, + u'skip': u'Blocked in the US', } def _real_extract(self, url): @@ -53,6 +55,10 @@ class VikiIE(SubtitlesInfoExtractor): info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id info_webpage = self._download_webpage( info_url, video_id, note=u'Downloading info page') + if re.match(r'\s*]+src="([^"]+)"', info_webpage, u'video URL') From 66cfab4226296c1596fbf37c27758bbdb6846d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 24 Nov 2013 21:18:35 +0100 Subject: [PATCH 019/121] [comedycentral] Add support for comedycentral.com videos (closes #1824) It's a subclass of MTVIE The extractor for colbertnation.com and thedailyshow.com is called now ComedyCentralShowsIE --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/comedycentral.py | 33 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1fbd10bc5..0b4d086b7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,7 @@ from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE -from .comedycentral import ComedyCentralIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 69b2beece..725849d2e 100644 --- 
a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,6 +2,7 @@ import re import xml.etree.ElementTree from .common import InfoExtractor +from .mtv import MTVIE, _media_xml_tag from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +12,37 @@ from ..utils import ( ) -class ComedyCentralIE(InfoExtractor): +class ComedyCentralIE(MTVIE): + _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P.*)' + _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + + _TEST = { + u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + u'md5': u'4167875aae411f903b751a21f357f1ee', + u'info_dict': { + u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', + u'ext': u'mp4', + u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', + u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + }, + } + # Overwrite MTVIE properties we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) + + +class ComedyCentralShowsIE(InfoExtractor): IE_DESC = u'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: From 16e055849ebfa5a942aef4411728b36bf53ebaa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 22:13:20 +0100 Subject: [PATCH 020/121] Update the keywords tests for the rename of the old ComedyCentralIE --- test/test_all_urls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 42813da1a..1f1adb6b4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,10 +101,10 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentral']) - self.assertMatch(':tds', ['ComedyCentral']) - self.assertMatch(':colbertreport', ['ComedyCentral']) - self.assertMatch(':cr', ['ComedyCentral']) + self.assertMatch(':thedailyshow', ['ComedyCentralShows']) + self.assertMatch(':tds', ['ComedyCentralShows']) + self.assertMatch(':colbertreport', ['ComedyCentralShows']) + self.assertMatch(':cr', ['ComedyCentralShows']) if __name__ == '__main__': From 1fb2bcbbf748e07d05f98110cc27d440506a9b77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 02:02:34 +0100 Subject: [PATCH 021/121] [viki] Make uploader field optional (#1813) --- youtube_dl/extractor/viki.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 7b3a58de8..cd986a749 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -34,11 +34,12 @@ class VikiIE(SubtitlesInfoExtractor): description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - uploader = 
self._html_search_regex( - r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, - u'uploader') - if uploader is not None: - uploader = uploader.strip() + uploader_m = re.search( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) + if uploader_m is None: + uploader = None + else: + uploader = uploader.group(1).strip() rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, From 02dbf93f0e98a56ed04b4a9e6a6d62efd6d801f9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:12:26 +0100 Subject: [PATCH 022/121] [zdf/common] Use API in ZDF extractor. This also comes with a lot of extra format fields Fixes #1518 --- youtube_dl/FileDownloader.py | 20 +----- youtube_dl/YoutubeDL.py | 24 ++++--- youtube_dl/extractor/common.py | 2 + youtube_dl/extractor/zdf.py | 115 ++++++++++++++++++++------------- youtube_dl/utils.py | 21 ++++++ 5 files changed, 112 insertions(+), 70 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index e5a542ed5..2b4fb0b31 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -1,4 +1,3 @@ -import math import os import re import subprocess @@ -11,6 +10,7 @@ from .utils import ( ContentTooShortError, determine_ext, encodeFilename, + format_bytes, sanitize_open, timeconvert, ) @@ -53,20 +53,6 @@ class FileDownloader(object): self._progress_hooks = [] self.params = params - @staticmethod - def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - @staticmethod def format_seconds(seconds): (mins, secs) = divmod(seconds, 60) @@ -117,7 +103,7 @@ class FileDownloader(object): def format_speed(speed): if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) + return '%10s' % ('%s/s' % format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -525,7 +511,7 @@ class FileDownloader(object): self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) return False - data_len_str = self.format_bytes(data_len) + data_len_str = format_bytes(data_len) byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) start = time.time() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d7e2417ac..0578fe6c1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -30,6 +30,7 @@ from .utils import ( DownloadError, encodeFilename, ExtractorError, + format_bytes, locked_file, MaxDownloadsReached, PostProcessingError, @@ -867,9 +868,11 @@ class YoutubeDL(object): def list_formats(self, info_dict): def format_note(fdict): - if fdict.get('format_note') is not None: - return fdict['format_note'] res = u'' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + u' ' + if fdict.get('quality_name') is not None: + res += u'%s ' % fdict['quality_name'] if fdict.get('vcodec') is not None: res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: @@ -886,25 +889,30 @@ class YoutubeDL(object): res += 'audio' if fdict.get('abr') is not None: res += u'@%3dk' % fdict['abr'] + if fdict.get('filesize') is not None: + if res: + res += u', ' + res += format_bytes(fdict['filesize']) return res - def line(format): - return (u'%-20s%-10s%-12s%s' % ( + def line(format, idlen=20): + return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), format_note(format), - ) - ) + )) formats = info_dict.get('formats', [info_dict]) - formats_s = list(map(line, formats)) + idlen = max(len(u'format code'), + max(len(f['format_id']) for f in formats)) + formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', - '_resolution': u'resolution', 'format_note': u'note'}) + '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 482a231ec..3c4781121 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -76,6 +76,8 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * quality_name Human-readable name of the video quality. + * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c6a9d06f2..a8d899883 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,75 +1,100 @@ +import operator import re from .common import InfoExtractor from ..utils import ( - determine_ext, - ExtractorError, + parse_xml_doc, + unified_strdate, ) class ZDFIE(InfoExtractor): _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' 
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') - if mobj.group('hash'): - url = url.replace(u'#', u'', 1) + xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + info_xml = self._download_webpage( + xml_url, video_id, note=u'Downloading video info') + doc = parse_xml_doc(info_xml) - html = self._download_webpage(url, video_id) - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if streams is None: - raise ExtractorError(u'No media url found.') + title = doc.find('.//information/title').text + description = doc.find('.//information/detail').text + uploader_node = doc.find('.//details/originChannelTitle') + uploader = None if uploader_node is None else uploader_node.text + duration_str = doc.find('.//details/length').text + duration_m = re.match(r'''(?x)^ + (?P<hours>[0-9]{2}) + :(?P<minutes>[0-9]{2}) + :(?P<seconds>[0-9]{2}) + (?:\.(?P<ms>[0-9]+)?) + ''', duration_str) + duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m + else None + ) + upload_date = unified_strdate(doc.find('.//details/airtime').text) - # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url - # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url - # choose first/default media type and highest quality for now - def stream_pref(s): - TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + def xml_to_format(fnode): + video_url = fnode.find('url').text + is_available = u'http://www.metafilegenerator' not in video_url + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + PROTO_ORDER = ['http', 'rtmp', 'rtsp'] try: - type_pref = TYPE_ORDER.index(s['media_type']) + proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - type_pref = 999 + proto_pref = 999 - QUALITY_ORDER = ['veryhigh', '300'] + quality = fnode.find('./quality').text + QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: - quality_pref = QUALITY_ORDER.index(s['quality']) + quality_pref = -QUALITY_ORDER.index(quality) except ValueError: quality_pref = 999 - return (type_pref, quality_pref) + abr = int(fnode.find('./audioBitrate').text) // 1000 + vbr = int(fnode.find('./videoBitrate').text) // 1000 + pref = (is_available, proto_pref, quality_pref, vbr, abr) - sorted_streams = sorted(streams, key=stream_pref) - if not sorted_streams: - raise ExtractorError(u'No stream found.') - stream = sorted_streams[0] + return { + 'format_id': format_id, + 'url': video_url, + 'ext': format_m.group('container'), + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': int(fnode.find('./width').text), + 'height': int(fnode.find('./height').text), + 'quality_name': quality, + 'filesize': int(fnode.find('./filesize').text), + 'format_note': None if is_available else u'(unavailable)', + '_pref': pref, + } - media_link = self._download_webpage( - stream['video_url'], - video_id, - u'Get stream URL') - - #MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - RTSP_STREAM = 
r'(?P<video_url>rtsp://[^"]*.mp4)' - - mobj = re.search(self._MEDIA_STREAM, media_link) - if mobj is None: - mobj = re.search(RTSP_STREAM, media_link) - if mobj is None: - raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - video_url = mobj.group('video_url') - - title = self._html_search_regex( - r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', - html, u'title') + format_nodes = doc.findall('.//formitaeten/formitaet') + formats = sorted(map(xml_to_format, format_nodes), + key=operator.itemgetter('_pref')) return { 'id': video_id, - 'url': video_url, 'title': title, - 'ext': determine_ext(video_url) + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'duration': duration, + 'upload_date': upload_date, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 34b3d19e0..ad0a06287 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -8,6 +8,7 @@ import gzip import io import json import locale +import math import os import pipes import platform @@ -16,6 +17,7 @@ import ssl import socket import sys import traceback +import xml.etree.ElementTree import zlib try: @@ -1006,3 +1008,22 @@ def unsmuggle_url(smug_url): jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] data = json.loads(jsond) return url, data + + +def parse_xml_doc(s): + assert isinstance(s, type(u'')) + return xml.etree.ElementTree.fromstring(s.encode('utf-8')) + + +def format_bytes(bytes): + if bytes is None: + return u'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return u'%.2f%s' % (converted, suffix) From c059bdd432911cff8c7426380a876c9679855ab5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:28:55 +0100 Subject: [PATCH 023/121] Remove quality_name field and improve zdf extractor --- youtube_dl/YoutubeDL.py | 2 -- youtube_dl/extractor/common.py | 1 - youtube_dl/extractor/zdf.py | 23 +++++++++++++++++------ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0578fe6c1..a896d9e63 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -871,8 +871,6 @@ class YoutubeDL(object): res = u'' if fdict.get('format_note') is not None: res += fdict['format_note'] + u' ' - if fdict.get('quality_name') is not None: - res += u'%s ' % fdict['quality_name'] if fdict.get('vcodec') is not None: res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3c4781121..3d8ac8ba2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -76,7 +76,6 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use - * quality_name Human-readable name of the video quality. * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. 
(It will be set diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a8d899883..07f830e80 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -52,6 +52,9 @@ class ZDFIE(InfoExtractor): (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) ''', format_id) + ext = format_m.group('container') + is_supported = ext != 'f4f' + PROTO_ORDER = ['http', 'rtmp', 'rtsp'] try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) @@ -67,26 +70,34 @@ class ZDFIE(InfoExtractor): abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 - pref = (is_available, proto_pref, quality_pref, vbr, abr) + pref = (is_available, is_supported, + proto_pref, quality_pref, vbr, abr) + + format_note = u'' + if not is_supported: + format_note += u'(unsupported)' + if not format_note: + format_note = None return { - 'format_id': format_id, + 'format_id': format_id + u'-' + quality, 'url': video_url, - 'ext': format_m.group('container'), + 'ext': ext, 'acodec': format_m.group('acodec'), 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, 'width': int(fnode.find('./width').text), 'height': int(fnode.find('./height').text), - 'quality_name': quality, 'filesize': int(fnode.find('./filesize').text), - 'format_note': None if is_available else u'(unavailable)', + 'format_note': format_note, '_pref': pref, + '_available': is_available, } format_nodes = doc.findall('.//formitaeten/formitaet') - formats = sorted(map(xml_to_format, format_nodes), + formats = sorted(filter(lambda f: f['_available'], + map(xml_to_format, format_nodes)), key=operator.itemgetter('_pref')) return { From 113577e155b10d6775f38e00b897f8e1d743a17e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:35:52 +0100 Subject: [PATCH 024/121] [generic] Improve detection Allow download of http://goo.gl/7X5tOk Fixes #1818 --- youtube_dl/extractor/generic.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0b5f2b2bb..37671430a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -209,7 +209,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) if mobj is None: # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage) + mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage) if mobj is None: # Try to find twitter cards info mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) @@ -236,18 +236,16 @@ class GenericIE(InfoExtractor): video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) # here's a fun little line of code for you: - video_extension = os.path.splitext(video_id)[1][1:] video_id = os.path.splitext(video_id)[0] # video uploader is domain name video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', url, u'video uploader') - return [{ + return { 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': None, 'title': video_title, - 'ext': video_extension, - }] + } From ac05067d3dbc68cd50e8e07d51700b5a8a698a29 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:37:49 +0100 Subject: [PATCH 025/121] release 2013.11.25 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/youtube_dl/version.py b/youtube_dl/version.py index de92411bb..91b36e55c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.24.1' +__version__ = '2013.11.25' From d0efb9ec9a85662fa43f026339821513ac2f039c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:47:32 +0100 Subject: [PATCH 026/121] [tests] Remove global_setup function --- test/helper.py | 4 ---- test/test_age_restriction.py | 3 +-- test/test_download.py | 2 -- test/test_playlists.py | 3 +-- test/test_subtitles.py | 3 +-- test/test_write_annotations.py | 3 +-- test/test_write_info_json.py | 3 +-- test/test_youtube_lists.py | 3 +-- test/test_youtube_signature.py | 3 --- 9 files changed, 6 insertions(+), 21 deletions(-) diff --git a/test/helper.py b/test/helper.py index d7bf7a828..b1f421ac5 100644 --- a/test/helper.py +++ b/test/helper.py @@ -12,10 +12,6 @@ from youtube_dl import YoutubeDL from youtube_dl.utils import preferredencoding -def global_setup(): - youtube_dl._setup_opener(timeout=10) - - def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 506572e9e..c9cdb96cb 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import global_setup, try_rm -global_setup() +from test.helper import try_rm from youtube_dl import YoutubeDL diff --git a/test/test_download.py b/test/test_download.py index fe7f7b8cb..dd5818dba 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -9,12 +9,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( get_params, get_testcases, - global_setup, try_rm, md5, report_warning ) -global_setup() import hashlib diff --git a/test/test_playlists.py b/test/test_playlists.py index 7c67239a4..167801ae2 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,8 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 06a304879..94a1f771d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup, md5 -global_setup() +from test.helper import FakeYDL, md5 from youtube_dl.extractor import ( diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 35defb895..eac53b285 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup, try_rm -global_setup() +from test.helper import get_params, try_rm import io diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index 30c4859fd..d7177611b 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup -global_setup() +from test.helper import get_params import io diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 938517a2d..8fd073f31 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5e1ff5eb0..056700614 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -6,9 +6,6 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import global_setup -global_setup() - import io import re From 07e40358799158e51453e2d2c493d265a495b9e0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 05:57:55 +0100 Subject: [PATCH 027/121] [viki] Fix uploader extraction --- youtube_dl/extractor/viki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cd986a749..20e8bbf7e 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -39,7 +39,7 @@ class VikiIE(SubtitlesInfoExtractor): if uploader_m is None: uploader = None else: - uploader = uploader.group(1).strip() + uploader = uploader_m.group(1).strip() rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, From 94ccb6fa2e3ec014bb995d05bfe634cf986d6198 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 05:58:04 +0100 Subject: [PATCH 028/121] [viki] Fix subtitles extraction --- youtube_dl/extractor/viki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 20e8bbf7e..ac199d410 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -91,7 +91,7 @@ class VikiIE(SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, info_webpage): res = {} - for sturl in re.findall(r'<track src="([^"]+)"/>'): + for sturl in re.findall(r'<track src="([^"]+)"/>', info_webpage): m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) if not m: continue From de79c46c8fa86dd3cb2383fd46cdd19a48e2f81f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:06:18 +0100 Subject: [PATCH 029/121] [viki] Fix subtitle extraction --- youtube_dl/extractor/viki.py | 4 +++- youtube_dl/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ac199d410..2206a06d5 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,6 +2,7 @@ import re from ..utils import ( ExtractorError, + unescapeHTML, unified_strdate, ) from .subtitles import SubtitlesInfoExtractor @@ -91,7 +92,8 @@ class VikiIE(SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, info_webpage): res = {} - for sturl in re.findall(r'<track src="([^"]+)"/>', info_webpage): + for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage): + sturl = unescapeHTML(sturl_html) m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) if not m: continue diff --git 
a/youtube_dl/utils.py b/youtube_dl/utils.py index caec00e37..946e90e93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -553,7 +553,7 @@ def make_HTTPS_handler(opts_no_check_certificate): self._tunnel() try: self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) - except ssl.SSLError as e: + except ssl.SSLError: self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): From daa0dd2973212fc1b2837c9572b1502f91f6acbc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:06:39 +0100 Subject: [PATCH 030/121] release 2013.11.25.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 91b36e55c..2af23040f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25' +__version__ = '2013.11.25.1' From d0d2b49ab728e70b8b325298e7825760fa7b3775 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:17:41 +0100 Subject: [PATCH 031/121] [FileDownloader] use moved format_bytes method --- youtube_dl/FileDownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index c6276d194..27684d0f6 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -289,7 +289,7 @@ class FileDownloader(object): data_len = None if percent > 0: data_len = int(downloaded_data_len * 100 / percent) - data_len_str = u'~'+self.format_bytes(data_len) + data_len_str = u'~' + format_bytes(data_len) self.report_progress(percent, data_len_str, speed, eta) cursor_in_new_line = False self._hook_progress({ From 5db07df634713fe73e15e98de62f70ffe3a47871 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 15:46:54 +0100 Subject: [PATCH 032/121] Fix --download-archive (Fixes #1826) --- youtube_dl/YoutubeDL.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 30ba94666..a1ef3a94a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -836,20 +836,26 @@ class YoutubeDL(object): except (IOError, OSError): self.report_warning(u'Unable to remove downloaded video file') - def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - extractor = info_dict.get('extractor_id') + def _make_archive_id(self, info_dict): + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist if extractor is None: + return None # Incomplete video information + return extractor.lower() + u' ' + info_dict['id'] + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if vid_id is None: return False # Incomplete video information - # Future-proof against any change in case - # and backwards compatibility with prior versions - extractor = extractor.lower() - vid_id = extractor + u' ' + info_dict['id'] + try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -864,7 +870,8 @@ 
class YoutubeDL(object): fn = self.params.get('download_archive') if fn is None: return - vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + vid_id = self._make_archive_id(info_dict) + assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + u'\n') From bb2bebdbe1ef06adc3c1cb2d078e061f44cf7d29 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 15:47:14 +0100 Subject: [PATCH 033/121] release 2013.11.25.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2af23040f..aed0c4e75 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.1' +__version__ = '2013.11.25.2' From d46cc192d763f66655247ee122e397626481caca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 19:11:01 +0100 Subject: [PATCH 034/121] Reduce socket timeout --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1ef3a94a..0cef1daf3 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -968,7 +968,7 @@ class YoutubeDL(object): proxy_map.update(handler.proxies) write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - def _setup_opener(self, timeout=300): + def _setup_opener(self, timeout=20): opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') From 2a15e7063bceed326bcbc0a01ba77324f0373a0c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 20:30:41 +0100 Subject: [PATCH 035/121] [soundcloud] Prefer HTTP over RTMP (#1798) --- youtube_dl/extractor/soundcloud.py | 66 ++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 67b2dff9c..ee8da227e 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -76,44 +76,74 @@ class SoundcloudIE(InfoExtractor): def _extract_info_dict(self, info, full_title=None, quiet=False): track_id = compat_str(info['id']) name = full_title or track_id - if quiet == False: + if quiet: self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') + ext = info.get('original_format', u'mp3') result = { - 'id': track_id, + 'id': track_id, 'uploader': info['user']['username'], 'upload_date': unified_strdate(info['created_at']), - 'title': info['title'], - 'ext': info.get('original_format', u'mp3'), + 'title': info['title'], 'description': info['description'], 'thumbnail': thumbnail, } if info.get('downloadable', False): # We can build a direct link to the song - result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) + format_url = ( + u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( + track_id, self._CLIENT_ID)) + result['formats'] = [{ + 'format_id': 'download', + 'ext': ext, + 'url': format_url, + }] else: # We have to retrieve the url stream_json = self._download_webpage( 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID), track_id, u'Downloading track url') - # There should be only one entry in the dictionary - key, stream_url = list(json.loads(stream_json).items())[0] - if 
key.startswith(u'http'): - result['url'] = stream_url - elif key.startswith(u'rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - result.update({ - 'url': url, - 'play_path': 'mp3:' + path, - }) - else: + + formats = [] + format_dict = json.loads(stream_json) + for key, stream_url in format_dict.items(): + if key.startswith(u'http'): + formats.append({ + 'format_id': key, + 'ext': ext, + 'url': stream_url, + }) + elif key.startswith(u'rtmp'): + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = stream_url.split('mp3:', 1) + formats.append({ + 'format_id': key, + 'url': url, + 'play_path': 'mp3:' + path, + 'ext': ext, + }) + + if not formats: # We fallback to the stream_url in the original info, this # cannot be always used, sometimes it can give an HTTP 404 error - result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID, + formats.append({ + 'format_id': u'fallback', + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'ext': ext, + }) + + def format_pref(f): + if f['format_id'].startswith('http'): + return 2 + if f['format_id'].startswith('rtmp'): + return 1 + return 0 + + formats.sort(key=format_pref) + result['formats'] = formats return result From 1a62c18f6521803ab41483f5da56fc72957d7655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:03:20 +0100 Subject: [PATCH 036/121] [bambuser] Skip the download in the test It doesn't respect the 'Range' header. --- youtube_dl/extractor/bambuser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 967568c4a..b80508efe 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -25,6 +25,11 @@ class BambuserIE(InfoExtractor): u'uploader': u'pixelversity', u'uploader_id': u'344706', }, + u'params': { + # It doesn't respect the 'Range' header, it would download the whole video + # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59 + u'skip_download': True, + }, } def _real_extract(self, url): From a3927cf7eefd2318cdfb44cdb213b3810ea7627b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 21:55:20 +0100 Subject: [PATCH 037/121] Allow to initialize a YoutubeDL object without parameters Having to pass the 'outtmpl' parameter feels really strange when you just want to extract the info of a video. --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1ef3a94a..46635bce1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -146,7 +146,7 @@ class YoutubeDL(object): _num_downloads = None _screen_file = None - def __init__(self, params): + def __init__(self, params={}): """Create a FileDownloader object with the given options.""" self._ies = [] self._ies_instances = {} @@ -169,7 +169,7 @@ class YoutubeDL(object): self.params = params self.fd = FileDownloader(self, self.params) - if '%(stitle)s' in self.params['outtmpl']: + if '%(stitle)s' in self.params.get('outtmpl', ''): self.report_warning(u'%(stitle)s is deprecated. 
Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') self._setup_opener() From 0c75c3fa7a24c05a74891ec49e5a18de4f2792f1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:15:20 +0100 Subject: [PATCH 038/121] Do not warn about fixed output template if --max-downloads is 1 Fixes #1828 --- youtube_dl/YoutubeDL.py | 8 +++----- youtube_dl/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0cef1daf3..50f750593 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -258,10 +258,6 @@ class YoutubeDL(object): if self.params.get('cookiefile') is not None: self.cookiejar.save() - def fixed_template(self): - """Checks if the output template is fixed.""" - return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) - def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -798,7 +794,9 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" - if len(url_list) > 1 and self.fixed_template(): + if (len(url_list) > 1 and + '%' not in self.params['outtmpl'] + and self.params.get('max_downloads') != 1): raise SameFileError(self.params['outtmpl']) for url in url_list: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1f15c7eaa..102508cf9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -206,7 +206,9 @@ def parseOpts(overrideArguments=None): dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') - selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) + selection.add_option('--max-downloads', metavar='NUMBER', + dest='max_downloads', type=int, default=None, + help='Abort after downloading NUMBER files') selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None) selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 
50k or 44.6m)", default=None) selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) From d9b011f201ef61c10ce63b6078cd1e21b6da4d4a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:31:27 +0100 Subject: [PATCH 039/121] Fix rtmpdump with non-ASCII filenames on Windows on 2.x Reported in #1798 --- youtube_dl/FileDownloader.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 27684d0f6..3ff9716b3 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -339,13 +339,29 @@ class FileDownloader(object): if live: basic_args += ['--live'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] + + if sys.platform == 'win32' and sys.version_info < (3, 0): + # Windows subprocess module does not actually support Unicode + # on Python 2.x + # See http://stackoverflow.com/a/9951851/35070 + subprocess_encoding = sys.getfilesystemencoding() + args = [a.encode(subprocess_encoding, 'ignore') for a in args] + else: + subprocess_encoding = None + if self.params.get('verbose', False): + if subprocess_encoding: + str_args = [ + a.decode(subprocess_encoding) if isinstance(a, bytes) else a + for a in args] + else: + str_args = args try: import pipes - shell_quote = lambda args: ' '.join(map(pipes.quote, args)) + shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) except ImportError: shell_quote = repr - self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) + self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) retval = run_rtmpdump(args) From fb04e40396509fd2bd41250eec3b07adf1aa1125 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:34:56 +0100 Subject: [PATCH 040/121] [soundcloud] Support for listing of audio-only files --- youtube_dl/YoutubeDL.py | 5 ++++- youtube_dl/extractor/soundcloud.py | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d0aab1bbd..87635e173 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -875,6 +875,8 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none': + return 'audio only' if format.get('_resolution') is not None: return format['_resolution'] if format.get('height') is not None: @@ -891,7 +893,8 @@ class YoutubeDL(object): res = u'' if fdict.get('format_note') is not None: res += fdict['format_note'] + u' ' - if fdict.get('vcodec') is not None: + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: res += u'video' diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ee8da227e..3a19ab172 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -100,6 +100,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': 'download', 'ext': ext, 'url': format_url, + 'vcodec': 'none', }] else: # We have to retrieve the url @@ -115,6 +116,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': key, 'ext': ext, 'url': stream_url, + 'vcodec': 'none', }) elif key.startswith(u'rtmp'): # The url doesn't have an rtmp app, we have to extract the playpath @@ -124,6 +126,7 @@ class SoundcloudIE(InfoExtractor): 'url': url, 
'play_path': 'mp3:' + path, 'ext': ext, + 'vcodec': 'none', }) if not formats: @@ -133,6 +136,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': u'fallback', 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'ext': ext, + 'vcodec': 'none', }) def format_pref(f): From 781a7d054657d813527fa0f98f831679675f8ea7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:36:18 +0100 Subject: [PATCH 041/121] release 2013.11.25.3 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index aed0c4e75..fc0881201 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.2' +__version__ = '2013.11.25.3' From 529a2e2cc35df8c77418f9d02b0f5b4730b95b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:52:09 +0100 Subject: [PATCH 042/121] Fix typo in the documentation of the 'download_archive' param --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 87635e173..e23042c48 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -126,7 +126,7 @@ class YoutubeDL(object): noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. - downloadarchive: File name of a file where all downloads are recorded. + download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. From d31209a1449d0bd9315e063be4cf7f5d45726563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:57:15 +0100 Subject: [PATCH 043/121] Use the 'extractor_key' field for the download archive file It has the same value as the ie_key. 
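For reference, a simplified standalone sketch of the archive bookkeeping this change
touches (written for this note, not taken from the patch; the locked_file handling and
error reporting are reduced to a plain open() for brevity):

    # Illustration only: how an archive entry is built and matched.
    def make_archive_id(info_dict):
        # Prefer 'extractor_key', fall back to 'ie_key' for playlist entries,
        # mirroring _make_archive_id above.
        extractor = info_dict.get('extractor_key')
        if extractor is None and 'id' in info_dict:
            extractor = info_dict.get('ie_key')
        if extractor is None:
            return None  # incomplete video information
        return extractor.lower() + u' ' + info_dict['id']

    def in_archive(archive_path, info_dict):
        vid_id = make_archive_id(info_dict)
        if vid_id is None:
            return False
        try:
            with open(archive_path) as archive_file:
                return any(line.strip() == vid_id for line in archive_file)
        except IOError:
            return False

    # e.g. a single YouTube video is recorded as the line u'youtube ' + video_id.

Because the extractor_key carries the same value as the ie_key, archive files written
before this change keep matching afterwards.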
--- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e23042c48..e86e8a090 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -837,7 +837,7 @@ class YoutubeDL(object): def _make_archive_id(self, info_dict): # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor') + extractor = info_dict.get('extractor_key') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist From c2e52508cca307113ff0c3aedcc0519d92c48f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 08:03:11 +0100 Subject: [PATCH 044/121] Include the proxy in the parameters for YoutubeDL (fixes #1831) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 102508cf9..0704515df 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -651,6 +651,7 @@ def _real_main(argv=None): 'download_archive': opts.download_archive, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, + 'proxy': opts.proxy, } with YoutubeDL(ydl_opts) as ydl: From c5ed4e8f7efaa258c74dd3179a7c691208874e41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 26 Nov 2013 10:41:35 +0100 Subject: [PATCH 045/121] release 2013.11.26 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fc0881201..99a5e0505 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.3' +__version__ = '2013.11.26' From 4a98cdbf3b19b07c7a885d348e79ddf79318f133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 18:53:36 +0100 Subject: [PATCH 046/121] YoutubeDL: set the 'params' property before any message/warning/error is sent (fixes #1840) If it sets the 'restrictfilenames' param, it will first report a warning. It will try to get the logger from the 'params' property, which would be set at that moment to None, raising the error 'AttributeError: 'NoneType' object has no attribute 'get'' --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e86e8a090..711b5d79e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -155,6 +155,7 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self.params = params if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -164,9 +165,8 @@ class YoutubeDL(object): u'Assuming --restrict-filenames since file system encoding ' u'cannot encode all charactes. 
' u'Set the LC_ALL environment variable to fix this.') - params['restrictfilenames'] = True + self.params['restrictfilenames'] = True - self.params = params self.fd = FileDownloader(self, self.params) if '%(stitle)s' in self.params.get('outtmpl', ''): From 6e47b51eef26dbaa3634b73914e4ee7213ad38f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 19:09:14 +0100 Subject: [PATCH 047/121] [youtube:playlist] Remove the link with index 0 It's not the first video of the playlist, it appears in the 'Play all' button (see the test course for an example) --- youtube_dl/extractor/youtube.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bf9cb7d4..4c43d5739 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1528,7 +1528,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' - _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' + _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' IE_NAME = u'youtube:playlist' @classmethod @@ -1562,8 +1562,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) - # The ids are duplicated - new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + matches = re.finditer(self._VIDEO_RE, page) + # We remove the duplicates and the link with index 0 + # (it's not the first video of the playlist) + new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') ids.extend(new_ids) if re.search(self._MORE_PAGES_INDICATOR, page) is None: From e26f8712289c727a43d74a4669aee4924b9f75f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 18:48:52 +0100 Subject: [PATCH 048/121] Use the new '_download_xml' helper in more extractors --- youtube_dl/extractor/anitube.py | 4 +--- youtube_dl/extractor/arte.py | 7 ++----- youtube_dl/extractor/canalplus.py | 4 +--- youtube_dl/extractor/clipfish.py | 4 +--- youtube_dl/extractor/cnn.py | 4 +--- youtube_dl/extractor/comedycentral.py | 7 ++----- youtube_dl/extractor/daum.py | 10 +++------- youtube_dl/extractor/dreisat.py | 4 +--- youtube_dl/extractor/ebaumsworld.py | 4 +--- youtube_dl/extractor/faz.py | 4 +--- youtube_dl/extractor/francetv.py | 4 +--- youtube_dl/extractor/internetvideoarchive.py | 7 ++----- youtube_dl/extractor/jeuxvideo.py | 8 ++------ youtube_dl/extractor/justintv.py | 4 +--- youtube_dl/extractor/livestream.py | 4 +--- youtube_dl/extractor/mtv.py | 3 +-- youtube_dl/extractor/myspass.py | 4 +--- youtube_dl/extractor/naver.py | 7 ++----- youtube_dl/extractor/nbc.py | 5 ++--- youtube_dl/extractor/nhl.py | 4 +--- youtube_dl/extractor/niconico.py | 10 +++------- youtube_dl/extractor/sina.py | 4 +--- youtube_dl/extractor/spiegel.py | 5 +---- youtube_dl/extractor/teamcoco.py | 4 +--- youtube_dl/extractor/toutv.py | 5 +---- youtube_dl/extractor/trilulilu.py | 5 +---- youtube_dl/extractor/videofyme.py | 4 +--- youtube_dl/extractor/youtube.py | 4 +--- 28 files changed, 38 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py 
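(Note on the helper used throughout this patch: _download_xml itself is not shown in
this series. Judging from the call sites and from the parse_xml_doc utility added
earlier, it presumably amounts to the sketch below; the exact signature and default
messages are assumptions, not copied from common.py.)

    import xml.etree.ElementTree

    def download_xml(download_webpage, url_or_request, video_id,
                     note=u'Downloading XML', errnote=u'Unable to download XML'):
        # download_webpage stands in for InfoExtractor._download_webpage and is
        # expected to return the document as a unicode string.
        xml_string = download_webpage(url_or_request, video_id, note, errnote)
        # Re-encode to UTF-8 before parsing, as utils.parse_xml_doc does.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))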
index 691d5a844..2b019daa9 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -28,9 +27,8 @@ class AnitubeIE(InfoExtractor): key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, u'key') - webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, + config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8')) video_title = config_xml.find('title').text diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 44d0b5d70..8b62ee774 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,7 +1,6 @@ # encoding: utf-8 import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -78,8 +77,7 @@ class ArteTvIE(InfoExtractor): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata') - ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml) + ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') @@ -109,9 +107,8 @@ class ArteTvIE(InfoExtractor): """Extract form http://liveweb.arte.tv/""" webpage = self._download_webpage(url, name) video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') - config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, video_id, u'Downloading information') - config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) event_doc = config_doc.find('event') url_node = event_doc.find('video').find('urlHd') if url_node is None: diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index bfa2a8b40..7cdcd8399 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import unified_strdate @@ -31,11 +30,10 @@ class CanalplusIE(InfoExtractor): webpage = self._download_webpage(url, mobj.group('path')) video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') info_url = self._VIDEO_INFO_TEMPLATE % video_id - info_page = self._download_webpage(info_url,video_id, + doc = self._download_xml(info_url,video_id, u'Downloading video info') self.report_extraction(video_id) - doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) video_info = [video for video in doc if video.find('ID').text == video_id][0] infos = video_info.find('INFOS') media = video_info.find('MEDIA') diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 95449da3c..5f0b5602f 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,6 +1,5 @@ import re import time -import xml.etree.ElementTree from .common import InfoExtractor @@ 
-25,9 +24,8 @@ class ClipfishIE(InfoExtractor): info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % (video_id, int(time.time()))) - info_xml = self._download_webpage( + doc = self._download_xml( info_url, video_id, note=u'Downloading info page') - doc = xml.etree.ElementTree.fromstring(info_xml) title = doc.find('title').text video_url = doc.find('filename').text thumbnail = doc.find('imageurl').text diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 34adf6dda..a034bb2fb 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -33,8 +32,7 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path - info_xml = self._download_webpage(info_url, page_title) - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml(info_url, page_title) formats = [] for f in info.findall('files/file'): diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 725849d2e..23647f99e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from .mtv import MTVIE, _media_xml_tag @@ -158,13 +157,12 @@ class ComedyCentralShowsIE(InfoExtractor): uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) - indexXml = self._download_webpage(indexUrl, epTitle, + idoc = self._download_xml(indexUrl, epTitle, u'Downloading show index', u'unable to download episode index') results = [] - idoc = xml.etree.ElementTree.fromstring(indexXml) itemEls = idoc.findall('.//item') for partNum,itemEl in enumerate(itemEls): mediaId = itemEl.findall('./guid')[0].text @@ -175,10 +173,9 @@ class ComedyCentralShowsIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + compat_urllib_parse.urlencode({'uri': mediaId})) - configXml = self._download_webpage(configUrl, epTitle, + cdoc = self._download_xml(configUrl, epTitle, u'Downloading configuration for %s' % shortMediaId) - cdoc = xml.etree.ElementTree.fromstring(configXml) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index a804e83bd..3d1dcb793 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -32,14 +31,12 @@ class DaumIE(InfoExtractor): full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', webpage, u'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) - info_xml = self._download_webpage( + info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, u'Downloading video info') - urls_xml = self._download_webpage( + urls = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' 
+ query, video_id, u'Downloading video formats info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) - urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) self.to_screen(u'%s: Getting video urls' % video_id) formats = [] @@ -49,10 +46,9 @@ class DaumIE(InfoExtractor): 'vid': full_id, 'profile': profile, }) - url_xml = self._download_webpage( + url_doc = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query, video_id, note=False) - url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8')) format_url = url_doc.find('result/url').text formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 765cb1f37..3cb382e12 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,7 +1,6 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details') - details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8')) + details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details') thumbnail_els = details_doc.findall('.//teaserimage') thumbnails = [{ diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index f02c6998b..877113d63 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - config_xml = self._download_webpage( + config = self._download_xml( 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video_url = config.find('file').text return { diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 89ed08db4..c0169de04 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -28,9 +27,8 @@ class FazIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, u'config xml url') - config_xml = self._download_webpage(config_xml_url, video_id, + config = self._download_xml(config_xml_url, video_id, u'Downloading config xml') - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) encodings = config.find('ENCODINGS') formats = [] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 086cafca0..6e1971043 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -11,11 +10,10 @@ from ..utils import ( class FranceTVBaseInfoExtractor(InfoExtractor): def _extract_video(self, video_id): - xml_desc = self._download_webpage( + info = self._download_xml( 
'http://www.francetvinfo.fr/appftv/webservices/video/' 'getInfosOeuvre.php?id-diffusion=' + video_id, video_id, 'Downloading XML config') - info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8')) manifest_url = info.find('videos/video/url').text video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8') diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index be8e05f53..16a6f73c8 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor): video_id = query_dic['publishedid'][0] url = self._build_url(query) - flashconfiguration_xml = self._download_webpage(url, video_id, + flashconfiguration = self._download_xml(url, video_id, u'Downloading flash configuration') - flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor): file_url = re.sub(r'(?<=\?)(.+)$', lambda m: self._clean_query(m.group()), file_url) - info_xml = self._download_webpage(file_url, video_id, + info = self._download_xml(file_url, video_id, u'Downloading video info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 0020c47cf..caf9d8c85 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -2,7 +2,6 @@ import json import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor): r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', xml_link, u'video ID') - xml_config = self._download_webpage( + config = self._download_xml( xml_link, title, u'Downloading XML config') - config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) - info_json = self._search_regex( - r'(?sm)<format\.json>(.*?)</format\.json>', - xml_config, u'JSON information') + info_json = config.find('format.json').text info = json.loads(info_json)['versions'][0] video_url = 'http://video720.jeuxvideo.com/' + info['file'] diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py index f60017992..e9bde0c18 100644 --- a/youtube_dl/extractor/justintv.py +++ b/youtube_dl/extractor/justintv.py @@ -1,7 +1,6 @@ import json import os import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -94,10 +93,9 @@ class JustinTVIE(InfoExtractor): archive_id = m.group(1) api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id - chapter_info_xml = self._download_webpage(api, chapter_id, + doc = self._download_xml(api, chapter_id, note=u'Downloading chapter information', errnote=u'Chapter information download failed') - doc = xml.etree.ElementTree.fromstring(chapter_info_xml) for a in doc.findall('.//archive'): if archive_id == a.find('./id').text: break diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 5f548437c..9bc35b115 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,6 +1,5 @@ import 
re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -80,8 +79,7 @@ class LivestreamOriginalIE(InfoExtractor): user = mobj.group('user') api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) - api_response = self._download_webpage(api_url, video_id) - info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8')) + info = self._download_xml(api_url, video_id) item = info.find('channel').find('item') ns = {'media': 'http://search.yahoo.com/mrss'} thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 04afd6c4c..42aee58be 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -109,9 +109,8 @@ class MTVIE(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id, + idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, u'Downloading info') - idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8')) return [self._get_video_info(item) for item in idoc.findall('.//item')] def _real_extract(self, url): diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 107665d15..0067bf134 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,5 +1,4 @@ import os.path -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -33,8 +32,7 @@ class MySpassIE(InfoExtractor): # get metadata metadata_url = META_DATA_URL_TEMPLATE % video_id - metadata_text = self._download_webpage(metadata_url, video_id) - metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) + metadata = self._download_xml(metadata_url, video_id) # extract values from metadata url_flv_el = metadata.find('url_flv') diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 9df236d69..d290397c7 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -38,14 +37,12 @@ class NaverIE(InfoExtractor): 'protocol': 'p2p', 'inKey': key, }) - info_xml = self._download_webpage( + info = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, video_id, u'Downloading video info') - urls_xml = self._download_webpage( + urls = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' 
+ query_urls, video_id, u'Downloading video formats info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) - urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3bc9dae6d..e8bbfff7b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import find_xpath_attr, compat_str @@ -21,8 +20,8 @@ class NBCNewsIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = all_info.find('video') return {'id': video_id, 'title': info.find('headline').text, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 458fe4063..2edd806a3 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,6 +1,5 @@ import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -26,9 +25,8 @@ class NHLBaseInfoExtractor(InfoExtractor): 'path': initial_video_url.replace('.mp4', '_sd.mp4'), }) path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_response = self._download_webpage(path_url, video_id, + path_doc = self._download_xml(path_url, video_id, u'Downloading final video url') - path_doc = xml.etree.ElementTree.fromstring(path_response) video_url = path_doc.find('path').text join = compat_urlparse.urljoin diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 729607ea3..46774317c 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -2,7 +2,6 @@ import re import socket -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -81,7 +80,7 @@ class NiconicoIE(InfoExtractor): # the cookies in order to be able to download the info webpage self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) - video_info_webpage = self._download_webpage( + video_info = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note=u'Downloading video info page') @@ -92,7 +91,6 @@ class NiconicoIE(InfoExtractor): video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - video_info = xml.etree.ElementTree.fromstring(video_info_webpage) video_title = video_info.find('.//title').text video_extension = video_info.find('.//movie_type').text video_format = video_extension.upper() @@ -107,13 +105,11 @@ class NiconicoIE(InfoExtractor): video_uploader = video_uploader_id url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id try: - user_info_webpage = self._download_webpage( + user_info = self._download_xml( url, video_id, note=u'Downloading user information') + video_uploader = user_info.find('.//nickname').text except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) - else: - user_info = 
xml.etree.ElementTree.fromstring(user_info_webpage) - video_uploader = user_info.find('.//nickname').text return { 'id': video_id, diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 14b1c656c..74a87fe56 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -1,7 +1,6 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -35,12 +34,11 @@ class SinaIE(InfoExtractor): def _extract_video(self, video_id): data = compat_urllib_parse.urlencode({'vid': video_id}) - url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data, + url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, video_id, u'Downloading video url') image_page = self._download_webpage( 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, video_id, u'Downloading thumbnail info') - url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8')) return {'id': video_id, 'url': url_doc.find('./durl/url').text, diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 19ce585cf..695520524 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -33,12 +32,10 @@ class SpiegelIE(InfoExtractor): r'<div class="module-title">(.*?)</div>', webpage, u'title') xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' - xml_code = self._download_webpage( + idoc = self._download_xml( xml_url, video_id, note=u'Downloading XML', errnote=u'Failed to download XML') - idoc = xml.etree.ElementTree.fromstring(xml_code) - formats = [ { 'format_id': n.tag.rpartition('type')[2], diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 165d9f88b..2bf26d056 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -32,8 +31,7 @@ class TeamcocoIE(InfoExtractor): self.report_extraction(video_id) data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id - data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage') - data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8')) + data = self._download_xml(data_url, video_id, 'Downloading data webpage') qualities = ['500k', '480p', '1000k', '720p', '1080p'] diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 2f728d3dc..1e9598ef6 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,6 +1,5 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -40,11 +39,9 @@ class TouTvIE(InfoExtractor): r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId - streams_webpage = self._download_webpage( + streams_doc = self._download_xml( streams_url, video_id, note=u'Downloading stream list') - streams_doc = xml.etree.ElementTree.fromstring( - streams_webpage.encode('utf-8')) video_url = next(n.text for n in streams_doc.findall('.//choice/url') if u'//ad.doubleclick' not in n.text) diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 0bf028f61..1c49e580d 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -1,6 +1,5 @@ import json import re 
-import xml.etree.ElementTree from .common import InfoExtractor @@ -36,12 +35,10 @@ class TriluliluIE(InfoExtractor): format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' u'video-formats2' % log) - format_str = self._download_webpage( + format_doc = self._download_xml( format_url, video_id, note=u'Downloading formats', errnote=u'Error while downloading formats') - - format_doc = xml.etree.ElementTree.fromstring(format_str) video_url_template = ( u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f64ffa5..912802d9a 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -27,9 +26,8 @@ class VideofyMeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id, + config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, video_id) - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video = config.find('video') sources = video.find('sources') url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4c43d5739..a76a9071a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import socket import string import struct import traceback -import xml.etree.ElementTree import zlib from .common import InfoExtractor, SearchInfoExtractor @@ -1144,8 +1143,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'asrs': 1, }) list_url = caption_url + '&' + list_params - list_page = self._download_webpage(list_url, video_id) - caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) + caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') From 652cdaa269725dfbf9effdc18a8fd0b369100399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 21:35:03 +0100 Subject: [PATCH 049/121] [youtube:playlist] Add support for YouTube mixes (fixes #1839) --- test/test_youtube_lists.py | 9 +++++++++ youtube_dl/extractor/youtube.py | 24 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 8fd073f31..95f07d129 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -107,5 +107,14 @@ class TestYoutubeLists(unittest.TestCase): result = ie.extract('http://www.youtube.com/show/airdisasters') self.assertTrue(len(result) >= 3) + def test_youtube_mix(self): + dl = FakeYDL() + ie = YoutubePlaylistIE(dl) + result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y') + entries = result['entries'] + self.assertTrue(len(entries) >= 20) + original_video = entries[0] + self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a76a9071a..9ef5fecce 100644 --- 
a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -28,6 +28,7 @@ from ..utils import ( clean_html, get_cachedir, get_element_by_id, + get_element_by_attribute, ExtractorError, unescapeHTML, unified_strdate, @@ -1537,6 +1538,22 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() + def _ids_to_results(self, ids): + return [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + + def _extract_mix(self, playlist_id): + # The mixes are generated from a a single video + # the id of the playlist is just 'RD' + video_id + url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) + webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') + title = clean_html(get_element_by_attribute('class', 'title long-title', webpage)) + video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) + ids = orderedSet(re.findall(video_re, webpage)) + url_results = self._ids_to_results(ids) + + return self.playlist_result(url_results, playlist_id, title) + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1554,6 +1571,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + if len(playlist_id) == 13: # 'RD' + 11 characters for the video id + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + # Extract the video ids from the playlist pages ids = [] @@ -1571,8 +1592,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] + url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) From c1f9c59d11c0a96be7caa0b4c6e90d900e3161c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 00:41:30 +0100 Subject: [PATCH 050/121] [bash-completion] Complete filenames or directories if the previous option requires it --- devscripts/bash-completion.in | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index ce893fcbe..cc469366d 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,10 +1,21 @@ __youtube_dl() { - local cur prev opts + local cur prev opts fileopts diropts COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" + fileopts="-a|--batch-file|--download-archive|--cookies" + diropts="--cache-dir" + + if [[ ${prev} =~ ${fileopts} ]]; then + COMPREPLY=( $(compgen -f -- ${cur}) ) + return 0 + elif [[ ${prev} =~ ${diropts} ]]; then + COMPREPLY=( $(compgen -d -- ${cur}) ) + return 0 + fi if [[ ${cur} =~ : ]]; then COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) From 5f09bbff4df9e11d803e3af066c08167f7f7dcb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 00:42:59 +0100 Subject: [PATCH 051/121] [bash-completion] Complete the ':ythistory' keyword --- devscripts/bash-completion.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index cc469366d..3af87a378 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,11 +1,11 @@ __youtube_dl() { - local cur prev opts fileopts diropts + local cur prev opts fileopts diropts keywords COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" - keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" + keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" fileopts="-a|--batch-file|--download-archive|--cookies" diropts="--cache-dir" From 4b19e3895492a472c5b63d9da5777bc29d44e25c Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filosottile.wiki@gmail.com> Date: Wed, 27 Nov 2013 02:54:51 +0100 Subject: [PATCH 052/121] [videopremium] support new .me domain --- youtube_dl/extractor/videopremium.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py index 4800415bd..acae81448 100644 --- a/youtube_dl/extractor/videopremium.py +++ b/youtube_dl/extractor/videopremium.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class VideoPremiumIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?' + _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?' _TEST = { u'url': u'http://videopremium.tv/4w7oadjsf156', u'file': u'4w7oadjsf156.f4v', @@ -41,4 +41,4 @@ class VideoPremiumIE(InfoExtractor): 'player_url': "http://videopremium.tv/uplayer/uppod.swf", 'ext': 'f4v', 'title': video_title, - } \ No newline at end of file + } From dcca796ce431da0d8b6927609c08938f22ba44cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 27 Nov 2013 18:33:51 +0100 Subject: [PATCH 053/121] [clipfish] Effect a better error message (#1842) --- youtube_dl/extractor/clipfish.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 5f0b5602f..05afce338 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,5 +1,6 @@ import re import time +import xml.etree.ElementTree from .common import InfoExtractor @@ -28,6 +29,10 @@ class ClipfishIE(InfoExtractor): info_url, video_id, note=u'Downloading info page') title = doc.find('title').text video_url = doc.find('filename').text + if video_url is None: + xml_bytes = xml.etree.ElementTree.tostring(doc) + raise ExtractorError(u'Cannot find video URL in document %r' % + xml_bytes) thumbnail = doc.find('imageurl').text duration_str = doc.find('duration').text m = re.match( From 76d1700b283ee482288eec12a6903a345742eead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 20:01:51 +0100 Subject: [PATCH 054/121] [youtube:playlist] Fix the extraction of the title for some mixes (#1844) Like https://www.youtube.com/watch?v=g8jDB5xOiuE&list=RDIh2gxLqR7HM --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9ef5fecce..fb61f47e8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1547,7 +1547,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], 
playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') - title = clean_html(get_element_by_attribute('class', 'title long-title', webpage)) + title_span = (get_element_by_attribute('class', 'title long-title', webpage) or + get_element_by_attribute('class', 'title ', webpage)) + title = clean_html(title_span) video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) ids = orderedSet(re.findall(video_re, webpage)) url_results = self._ids_to_results(ids) From 35907e23ec4d7e754ff239693500e05886b80ee7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 21:24:55 +0100 Subject: [PATCH 055/121] [yahoo] Fix video extraction and use the new format system exclusively --- youtube_dl/extractor/yahoo.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 34e6afb20..617e3bb06 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -46,7 +46,7 @@ class YahooIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$', + items_json = self._search_regex(r'mediaItems: ({.*?})$', webpage, u'items', flags=re.MULTILINE) items = json.loads(items_json) info = items['mediaItems']['query']['results']['mediaObj'][0] @@ -91,17 +91,13 @@ class YahooIE(InfoExtractor): formats.append(format_info) formats = sorted(formats, key=lambda f:(f['height'], f['width'])) - info = { + return { 'id': video_id, 'title': meta['title'], 'formats': formats, 'description': clean_html(meta['description']), 'thumbnail': meta['thumbnail'], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info class YahooSearchIE(SearchInfoExtractor): From 0e44d8381a439c84dd23477d32f7da4bb0a06293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 00:33:27 +0100 Subject: [PATCH 056/121] [youtube:feeds] Use the 'paging' value from the downloaded json information (fixes #1845) --- youtube_dl/extractor/youtube.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb61f47e8..765b4a9bf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1791,7 +1791,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True - _PAGING_STEP = 30 # use action_load_personal_feed instead of action_load_system_feed _PERSONAL_FEED = False @@ -1811,9 +1810,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_extract(self, url): feed_entries = [] - # The step argument is available only in 2.7 or higher - for i in itertools.count(0): - paging = i*self._PAGING_STEP + paging = 0 + for i in itertools.count(1): info = self._download_webpage(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) @@ -1826,6 +1824,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video_id in ids) if info['paging'] is None: break + paging = info['paging'] return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): @@ -1845,7 +1844,6 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' _FEED_NAME = 'watch_later' _PLAYLIST_TITLE = u'Youtube Watch Later' - _PAGING_STEP = 100 _PERSONAL_FEED = True class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): @@ -1855,13 +1853,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _PERSONAL_FEED = True _PLAYLIST_TITLE = u'Youtube Watch History' - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') - data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') - # The step is actually a ridiculously big number (like 1374343569725646) - self._PAGING_STEP = int(data_paging) - return super(YoutubeHistoryIE, self)._real_extract(url) - class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' From a2e6db365c11d8c9eaaaeb8de53d59add648f978 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:47:20 +0100 Subject: [PATCH 057/121] [zdf] add a pseudo-testcase and fix URL matching --- youtube_dl/extractor/zdf.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 07f830e80..3c01cc041 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,3 +1,5 @@ +# coding: utf-8 + import operator import re @@ -9,7 +11,19 @@ from ..utils import ( class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' + + _TEST = { + u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", + u"file": u"2037704.webm", + u"info_dict": { + u"upload_date": u"20131127", + u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? 
Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", + u"uploader": u"spezial", + u"title": u"ZDFspezial - Ende des Machtpokers" + }, + u"skip": u"Videos on ZDF.de are depublicised in short order", + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 2a275ab007d6d336b44a6a0cd4fac6783ba63cb8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:47:50 +0100 Subject: [PATCH 058/121] [zdf] Use _download_xml --- youtube_dl/extractor/common.py | 3 ++- youtube_dl/extractor/zdf.py | 8 ++++---- youtube_dl/utils.py | 5 ----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5656445a3..4f1b50880 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -210,7 +210,8 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] - def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + def _download_xml(self, url_or_request, video_id, + note=u'Downloading XML', errnote=u'Unable to download XML'): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 3c01cc041..689f19735 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( - parse_xml_doc, unified_strdate, ) @@ -30,9 +29,10 @@ class ZDFIE(InfoExtractor): video_id = mobj.group('video_id') xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - info_xml = self._download_webpage( - xml_url, video_id, note=u'Downloading video info') - doc = parse_xml_doc(info_xml) + doc = self._download_xml( + xml_url, video_id, + note=u'Downloading video info', + errnote=u'Failed to download video info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 946e90e93..c486ef8ec 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1009,11 +1009,6 @@ def unsmuggle_url(smug_url): return url, data -def parse_xml_doc(s): - assert isinstance(s, type(u'')) - return xml.etree.ElementTree.fromstring(s.encode('utf-8')) - - def format_bytes(bytes): if bytes is None: return u'N/A' From ea07dbb8b108d7c77b6b822fba98817063a8457a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:48:32 +0100 Subject: [PATCH 059/121] release 2013.11.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 99a5e0505..03cb283bd 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.26' +__version__ = '2013.11.28' From f8f60d27931421f969c7ec0a2a45caa743549994 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:54:46 +0100 Subject: [PATCH 060/121] [clipfish] Fix imports (#1842) --- youtube_dl/extractor/clipfish.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 
05afce338..ba5623572 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -3,6 +3,7 @@ import time import xml.etree.ElementTree from .common import InfoExtractor +from ..utils import ExtractorError class ClipfishIE(InfoExtractor): From fc9e1cc69706ef079fca0ee32529503ecedae578 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 06:10:37 +0100 Subject: [PATCH 061/121] [clipfish] Use FIFA trailer as testcase (#1842) --- youtube_dl/extractor/clipfish.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index ba5623572..0d18e9a7a 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -11,12 +11,12 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' _TEST = { - u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', - u'file': u'4028320.f4v', - u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', + u'file': u'3966754.mp4', + u'md5': u'2521cd644e862936cf2e698206e47385', u'info_dict': { - u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', - u'duration': 399, + u'title': u'FIFA 14 - E3 2013 Trailer', + u'duration': 82, } } From 4e0084d92e589d385f28ac98bfb847240d61dc93 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 06:14:17 +0100 Subject: [PATCH 062/121] [youtube/subtitles] Change MD5 of vtt subtitle in test --- test/test_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 94a1f771d..23a653124 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -72,7 +72,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') def test_youtube_list_subtitles(self): self.DL.expect_warning(u'Video doesn\'t have automatic captions') From 2be54167d085c5b4c956c66ad0367fdcfb68b891 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 06:17:56 +0100 Subject: [PATCH 063/121] release 2013.11.28.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 03cb283bd..9cae97ee2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.28' +__version__ = '2013.11.28.1' From d8d6148628b972b6998a8c2a5465f031a44f4004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 13:32:49 +0100 Subject: [PATCH 064/121] Add an extractor for Internet Movie Database trailers (closes #1832) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/imdb.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/imdb.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0b4d086b7..30e4a9105 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -71,6 +71,7 @@ from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE +from .imdb import ImdbIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py new file mode 100644 index 000000000..07e4f7d29 --- /dev/null +++ b/youtube_dl/extractor/imdb.py @@ -0,0 +1,59 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_attribute, +) + + +class ImdbIE(InfoExtractor): + IE_NAME = u'imdb' + IE_DESC = u'Internet Movie Database trailers' + _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.imdb.com/video/imdb/vi2524815897', + u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068', + u'info_dict': { + u'id': u'2524815897', + u'ext': u'mp4', + u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + u'description': u'md5:9061c2219254e5d14e03c25c98e96a81', + u'duration': 151, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url,video_id) + descr = get_element_by_attribute('itemprop', 'description', webpage) + available_formats = re.findall( + r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, + flags=re.MULTILINE) + formats = [] + for f_id, f_path in available_formats: + format_page = self._download_webpage( + compat_urlparse.urljoin(url, f_path), + u'Downloading info for %s format' % f_id) + json_data = get_element_by_attribute('class', 'imdb-player-data', + format_page) + info = json.loads(json_data) + format_info = info['videoPlayerObject']['video'] + formats.append({ + 'format_id': f_id, + 'url': format_info['url'], + 'height': format_info['height'], + 'width': format_info['width'], + }) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': descr, + 'thumbnail': format_info['slate'], + 'duration': int(info['titleObject']['title']['duration_seconds']), + } From b03d0d064c0e198aa281faacb2b5a74af7628b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 13:49:00 +0100 Subject: [PATCH 065/121] [imdb] Fix extraction in python 2.6 Using a regular expression because the html cannot be parsed. 
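
For illustration, the approach this fix takes is to pull the embedded JSON out of the player <script> tag with a regular expression instead of an HTML parser. A minimal, self-contained sketch of that pattern follows; the sample markup and JSON structure here are invented for illustration and do not reproduce IMDb's real payload, only the regex mirrors the one used in the patch.

    import json
    import re

    # Hypothetical markup resembling what a player page might embed.
    format_page = ('<script type="text/javascript" class="imdb-player-data">'
                   '{"videoPlayerObject": {"video": {"url": "http://example.com/clip.mp4"}}}'
                   '</script>')

    # Grab everything between the opening and closing tag with a regex,
    # mirroring the re.DOTALL search introduced by this commit.
    json_data = re.search(
        r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
        format_page, re.DOTALL).group(1)

    info = json.loads(json_data)
    print(info['videoPlayerObject']['video']['url'])
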
--- youtube_dl/extractor/imdb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 07e4f7d29..520edc7d0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -38,8 +38,9 @@ class ImdbIE(InfoExtractor): format_page = self._download_webpage( compat_urlparse.urljoin(url, f_path), u'Downloading info for %s format' % f_id) - json_data = get_element_by_attribute('class', 'imdb-player-data', - format_page) + json_data = self._search_regex( + r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>', + format_page, u'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] formats.append({ From 3862402ff3a991e7fb58470ac38fba82ba9b18d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 14:38:10 +0100 Subject: [PATCH 066/121] Add an extractor for Clipsyndicate (closes #1744) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clipsyndicate.py | 52 +++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/clipsyndicate.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 30e4a9105..1e4f36aa3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -21,6 +21,7 @@ from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE +from .clipsyndicate import ClipsyndicateIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py new file mode 100644 index 000000000..d4fc86973 --- /dev/null +++ b/youtube_dl/extractor/clipsyndicate.py @@ -0,0 +1,52 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, +) + + +class ClipsyndicateIE(InfoExtractor): + _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', + u'info_dict': { + u'id': u'4629301', + u'ext': u'mp4', + u'title': u'Brick Briscoe', + u'duration': 612, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + js_player = self._download_webpage( + 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, + video_id, u'Downlaoding player') + # it includes a required token + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + + playlist_page = self._download_webpage( + 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, + video_id, u'Downloading video info') + # Fix broken xml + playlist_page = re.sub('&', '&', playlist_page) + pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + + track_doc = pdoc.find('trackList/track') + def find_param(name): + node = find_xpath_attr(track_doc, './/param', 'name', name) + if node is not None: + return node.attrib['value'] + + return { + 'id': video_id, + 'title': find_param('title'), + 'url': track_doc.find('location').text, + 'thumbnail': find_param('thumbnail'), + 'duration': int(find_param('duration')), + } From 677c18092d8fd5ca6e08b25985c8533b6a0738d5 Mon 
Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 03:33:25 +0100 Subject: [PATCH 067/121] [podomatic] Add extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/podomatic.py | 49 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/podomatic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1e4f36aa3..fd890e251 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -107,6 +107,7 @@ from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .podomatic import PodomaticIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py new file mode 100644 index 000000000..58200971b --- /dev/null +++ b/youtube_dl/extractor/podomatic.py @@ -0,0 +1,49 @@ +import json +import re + +from .common import InfoExtractor + + +class PodomaticIE(InfoExtractor): + IE_NAME = 'podomatic' + _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' + + _TEST = { + u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", + u"file": u"2009-01-02T16_03_35-08_00.mp3", + u"md5": u"84bb855fcf3429e6bf72460e1eed782d", + u"info_dict": { + u"uploader": u"Science Teaching Tips", + u"uploader_id": u"scienceteachingtips", + u"title": u"64. When the Moon Hits Your Eye", + u"duration": 446, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + channel = mobj.group('channel') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, note=u'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int(data['length'] / 1000.0) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } From 17769d5a6c24eb8f5d609aa99f84debc3fe4adec Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 03:34:26 +0100 Subject: [PATCH 068/121] release 2013.11.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9cae97ee2..a73d7fb5c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.28.1' +__version__ = '2013.11.29' From acf37ca151d67ee28034775662318d9a0a1eb6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 07:56:14 +0100 Subject: [PATCH 069/121] [imdb] Fix the resolution values (fixes #1847) We were using the size of the player, it was the same for all the formats --- youtube_dl/extractor/imdb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 520edc7d0..d8e9712a7 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -46,8 +46,7 @@ class ImdbIE(InfoExtractor): formats.append({ 'format_id': f_id, 
'url': format_info['url'], - 'height': format_info['height'], - 'width': format_info['width'], + 'height': int(info['titleObject']['encoding']['selected'][:-1]), }) return { From e1f900d6a4c449b2a7c7ed74dbe8eca74cbccf13 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser <nikai@nikai.net> Date: Fri, 29 Nov 2013 09:44:05 +0100 Subject: [PATCH 070/121] fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 580b16004..af4d969d6 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ which means you can modify it, redistribute it or use it however you like. directory ## Video Format Options: - -f, --format FORMAT video format code, specifiy the order of + -f, --format FORMAT video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported --all-formats download all available video formats From 9986238ba9a486ca76334c50562760a312ab20fa Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser <nikai@nikai.net> Date: Fri, 29 Nov 2013 09:48:38 +0100 Subject: [PATCH 071/121] fix typo in help --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0704515df..8f8422cc7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -235,7 +235,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default='best', - help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From a3fb4675fb67b061e2a71cec78a5dbd8695b8ef0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 15:25:09 +0100 Subject: [PATCH 072/121] Do not mutate default arguments In this case, it looks rather harmless (since the conditions for --restrict-filenames should not change while a process is running), but just to be sure. This also simplifies the interface for callers, who can just pass in the idiomatic None for "I don't care, whatever is the default". 
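
A minimal sketch of the Python pitfall being avoided here, using invented names: a mutable default argument is evaluated once, when the function is defined, so every call that falls back to the default shares the same object. Passing None and creating the object inside the body, as the change below does for YoutubeDL.__init__, gives each call its own fresh value.

    # Hypothetical helper illustrating the shared-default problem.
    def add_item(item, bucket=[]):
        bucket.append(item)
        return bucket

    print(add_item('a'))  # ['a']
    print(add_item('b'))  # ['a', 'b'] - the same list object is reused

    # The idiomatic fix: use None as the sentinel and build the default per call.
    def add_item_fixed(item, bucket=None):
        bucket = [] if bucket is None else bucket
        bucket.append(item)
        return bucket

    print(add_item_fixed('a'))  # ['a']
    print(add_item_fixed('b'))  # ['b'] - each call gets a fresh list
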
--- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 711b5d79e..b822930cb 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -146,7 +146,7 @@ class YoutubeDL(object): _num_downloads = None _screen_file = None - def __init__(self, params={}): + def __init__(self, params=None): """Create a FileDownloader object with the given options.""" self._ies = [] self._ies_instances = {} @@ -155,7 +155,7 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] - self.params = params + self.params = {} if params is None else params if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] From befd88b786dc41ff075693fd17bafbc7fa4c100e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 15:25:43 +0100 Subject: [PATCH 073/121] [yahoo] Add an extractor for yahoo news (closes #1849) --- youtube_dl/extractor/__init__.py | 6 +++++- youtube_dl/extractor/yahoo.py | 34 +++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fd890e251..664639b53 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -172,7 +172,11 @@ from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeIE -from .yahoo import YahooIE, YahooSearchIE +from .yahoo import ( + YahooIE, + YahooNewsIE, + YahooSearchIE, +) from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 617e3bb06..2d87e81b2 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -53,8 +53,11 @@ class YahooIE(InfoExtractor): # The 'meta' field is not always in the video webpage, we request it # from another page long_id = info['id'] + return self._get_info(info['id'], video_id) + + def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' - ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id) + ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' % long_id) data = compat_urllib_parse.urlencode({ 'q': query, 'env': 'prod', @@ -100,6 +103,35 @@ class YahooIE(InfoExtractor): } +class YahooNewsIE(YahooIE): + IE_NAME = 'yahoo:news' + _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html' + + _TEST = { + u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + u'info_dict': { + u'id': u'104538833', + u'ext': u'flv', + u'title': u'China Moses Is Crazy About the Blues', + u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, + } + + # Overwrite YahooIE properties we don't want + _TESTS = [] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id') + return self._get_info(long_id, video_id) + + class YahooSearchIE(SearchInfoExtractor): IE_DESC = u'Yahoo screen search' _MAX_RESULTS = 1000 From 323ec6ae566af9744edce97a23e623d99eea8a1f Mon Sep 17 00:00:00 2001 
From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 15:57:43 +0100 Subject: [PATCH 074/121] Clarify --download-archive help --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0704515df..c63d62986 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -220,7 +220,7 @@ def parseOpts(overrideArguments=None): default=None, type=int) selection.add_option('--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not present in the archive file. Record all downloaded videos in it.') + help='Download only videos not present in the archive file. Record the IDs of all downloaded videos in it.') authentication.add_option('-u', '--username', From c5171c454b4392f7276b7f9e94c25d7f1ad60375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 22:06:17 +0100 Subject: [PATCH 075/121] [yahoo] Force use of the http protocol for downloading the videos. --- youtube_dl/extractor/yahoo.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 2d87e81b2..e457c4707 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,27 +17,21 @@ class YahooIE(InfoExtractor): _TESTS = [ { u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - u'file': u'214727115.flv', + u'file': u'214727115.mp4', + u'md5': u'4962b075c08be8690a922ee026d05e69', u'info_dict': { u'title': u'Julian Smith & Travis Legg Watch Julian Smith', u'description': u'Julian and Travis watch Julian Smith', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, }, { u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - u'file': u'103000935.flv', + u'file': u'103000935.mp4', + u'md5': u'd6e6fc6e1313c608f316ddad7b82b306', u'info_dict': { u'title': u'Codefellas - The Cougar Lies with Spanish Moss', u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? 
And if your waffles have nothing to hide, what are they so worried about?', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, }, ] @@ -57,7 +51,8 @@ class YahooIE(InfoExtractor): def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' - ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' % long_id) + ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' + ' AND protocol="http"' % long_id) data = compat_urllib_parse.urlencode({ 'q': query, 'env': 'prod', @@ -109,16 +104,13 @@ class YahooNewsIE(YahooIE): _TEST = { u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + u'md5': u'67010fdf3a08d290e060a4dd96baa07b', u'info_dict': { u'id': u'104538833', - u'ext': u'flv', + u'ext': u'mp4', u'title': u'China Moses Is Crazy About the Blues', u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, } # Overwrite YahooIE properties we don't want From 06dcbb71d8e19947eb6e71390a6a0640abe3dad0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 30 Nov 2013 00:42:43 +0100 Subject: [PATCH 076/121] Clarify help of --write-pages (#1853) --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c63d62986..42ab572f2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -317,7 +317,7 @@ def parseOpts(overrideArguments=None): help='print downloaded pages to debug problems(very verbose)') verbosity.add_option('--write-pages', action='store_true', dest='write_pages', default=False, - help='Write downloaded pages to files in the current directory') + help='Write downloaded intermediary pages to files in the current directory to debug problems') verbosity.add_option('--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) From 5e09d6abbd09de92869cbb8ed204d18f9cd04931 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Dec 2013 01:16:20 +0100 Subject: [PATCH 077/121] [clipfish] Skip test on travis --- youtube_dl/extractor/clipfish.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 0d18e9a7a..43efb08bf 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -17,7 +17,8 @@ class ClipfishIE(InfoExtractor): u'info_dict': { u'title': u'FIFA 14 - E3 2013 Trailer', u'duration': 82, - } + }, + u'skip': 'Blocked in the US' } def _real_extract(self, url): From 355e4fd07e7f9c0632d9d78415675f8b5cc3c2ce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Dec 2013 01:21:33 +0100 Subject: [PATCH 078/121] [generic] Find embedded dailymotion videos (Fixes #1848) --- youtube_dl/extractor/generic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 37671430a..10ae06263 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -195,6 +195,15 @@ class GenericIE(InfoExtractor): return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for embedded Dailymotion player + matches = re.findall( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) + if matches: + urlrs = 
[self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') + for tuppl in matches] + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: From e344693b65a42436eb40efe85095c01f767a502d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Dec 2013 11:42:02 +0100 Subject: [PATCH 079/121] Make socket timeout configurable, and bump default to 10 minutes (#1862) --- test/parameters.json | 3 ++- youtube_dl/YoutubeDL.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index f042880ed..487a46d56 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -39,5 +39,6 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false + "listssubtitles": false, + "socket_timeout": 20 } diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b822930cb..b7393fd79 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -132,6 +132,7 @@ class YoutubeDL(object): cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates proxy: URL of the proxy server to use + socket_timeout: Time to wait for unresponsive hosts, in seconds The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -969,7 +970,8 @@ class YoutubeDL(object): proxy_map.update(handler.proxies) write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - def _setup_opener(self, timeout=20): + def _setup_opener(self): + timeout = float(self.params.get('socket_timeout', 600)) opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') From 55a10eab48776197245d3d87b86195f182d8d82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 1 Dec 2013 22:36:18 +0100 Subject: [PATCH 080/121] [vimeo] Add an extractor for users (closes #1871) --- test/test_all_urls.py | 4 ++++ test/test_playlists.py | 9 +++++++ youtube_dl/extractor/__init__.py | 6 ++++- youtube_dl/extractor/vimeo.py | 41 ++++++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 11 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 1f1adb6b4..6b9764c67 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -106,6 +106,10 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':colbertreport', ['ComedyCentralShows']) self.assertMatch(':cr', ['ComedyCentralShows']) + def test_vimeo_matching(self): + self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 167801ae2..13a6f4b2f 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( DailymotionPlaylistIE, DailymotionUserIE, VimeoChannelIE, + VimeoUserIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, @@ -54,6 +55,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Vimeo Tributes') self.assertTrue(len(result['entries']) > 24) + def test_vimeo_user(self): + dl = FakeYDL() + ie = VimeoUserIE(dl) + result = 
ie.extract('http://vimeo.com/nkistudio/videos') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Nki') + self.assertTrue(len(result['entries']) > 65) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 664639b53..cc93e619c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -159,7 +159,11 @@ from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE -from .vimeo import VimeoIE, VimeoChannelIE +from .vimeo import ( + VimeoIE, + VimeoChannelIE, + VimeoUserIE, +) from .vine import VineIE from .viki import VikiIE from .vk import VKIE diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7d82c2cfa..f27763ae2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -249,25 +249,46 @@ class VimeoChannelIE(InfoExtractor): IE_NAME = u'vimeo:channel' _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)' _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' + _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + def _extract_videos(self, list_id, base_url): video_ids = [] - for pagenum in itertools.count(1): - webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum), - channel_id, u'Downloading page %s' % pagenum) + webpage = self._download_webpage( + '%s/videos/page:%d/' % (base_url, pagenum),list_id, + u'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') for video_id in video_ids] - channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id, - webpage, u'channel title') + list_title = self._html_search_regex(self._TITLE_RE, webpage, + u'list title') return {'_type': 'playlist', - 'id': channel_id, - 'title': channel_title, + 'id': list_id, + 'title': list_title, 'entries': entries, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) + + +class VimeoUserIE(VimeoChannelIE): + IE_NAME = u'vimeo:user' + _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)' + _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' + + @classmethod + def suitable(cls, url): + if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url): + return False + return super(VimeoUserIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + return self._extract_videos(name, 'http://vimeo.com/%s' % name) From 6ad14cab599b05a658756fef47d3837281429da7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 13:37:05 +0100 Subject: [PATCH 081/121] Add --socket-timeout option --- youtube_dl/YoutubeDL.py | 4 +++- youtube_dl/__init__.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b7393fd79..b68b110a4 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -971,7 +971,9 @@ class YoutubeDL(object): write_string(u'[debug] Proxy map: ' + 
compat_str(proxy_map) + u'\n') def _setup_opener(self): - timeout = float(self.params.get('socket_timeout', 600)) + timeout_val = self.params.get('socket_timeout') + timeout = 600 if timeout_val is None else float(timeout_val) + opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 92e583744..799eca566 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -198,6 +198,9 @@ def parseOpts(overrideArguments=None): general.add_option( '--no-cache-dir', action='store_const', const=None, dest='cachedir', help='Disable filesystem caching') + general.add_option( + '--socket-timeout', dest='socket_timeout', + type=float, default=None, help=optparse.SUPPRESS_HELP) selection.add_option('--playlist-start', @@ -652,6 +655,7 @@ def _real_main(argv=None): 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, } with YoutubeDL(ydl_opts) as ydl: From 0037e02921e7f70409ce113fb060765a6f24a27e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 13:37:26 +0100 Subject: [PATCH 082/121] release 2013.12.02 --- README.md | 7 ++++--- youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index af4d969d6..031e436b6 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,8 @@ which means you can modify it, redistribute it or use it however you like. --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age --download-archive FILE Download only videos not present in the archive - file. Record all downloaded videos in it. + file. Record the IDs of all downloaded videos in + it. ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. @@ -130,8 +131,8 @@ which means you can modify it, redistribute it or use it however you like. 
-v, --verbose print various debugging information --dump-intermediate-pages print downloaded pages to debug problems(very verbose) - --write-pages Write downloaded pages to files in the current - directory + --write-pages Write downloaded intermediary pages to files in + the current directory to debug problems ## Video Format Options: -f, --format FORMAT video format code, specify the order of diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a73d7fb5c..d8f341ab9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.29' +__version__ = '2013.12.02' From 5270d8cb1389a9b26fa698137bf4861d4bab6a25 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Mon, 2 Dec 2013 20:10:19 +0700 Subject: [PATCH 083/121] Added extractors for smotri.com --- test/test_playlists.py | 22 ++- youtube_dl/extractor/__init__.py | 5 + youtube_dl/extractor/smotri.py | 239 +++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/smotri.py diff --git a/test/test_playlists.py b/test/test_playlists.py index 13a6f4b2f..00c950109 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -22,7 +22,9 @@ from youtube_dl.extractor import ( LivestreamIE, NHLVideocenterIE, BambuserChannelIE, - BandcampAlbumIE + BandcampAlbumIE, + SmotriCommunityIE, + SmotriUserIE ) @@ -119,6 +121,24 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['title'], u'Nightmare Night EP') self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_community(self): + dl = FakeYDL() + ie = SmotriCommunityIE(dl) + result = ie.extract('http://smotri.com/community/video/kommuna') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'kommuna') + self.assertEqual(result['title'], u'КПРФ') + self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_user(self): + dl = FakeYDL() + ie = SmotriUserIE(dl) + result = ie.extract('http://smotri.com/user/inspector') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'inspector') + self.assertEqual(result['title'], u'Inspector') + self.assertTrue(len(result['entries']) >= 9) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cc93e619c..bd996483b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,6 +121,11 @@ from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE +from .smotri import ( + SmotriIE, + SmotriCommunityIE, + SmotriUserIE, +) from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import ( diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py new file mode 100644 index 000000000..ea42d5320 --- /dev/null +++ b/youtube_dl/extractor/smotri.py @@ -0,0 +1,239 @@ +# encoding: utf-8 + +import re +import json +import hashlib + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError +) + + +class SmotriIE(InfoExtractor): + IE_DESC = u'Smotri.com' + IE_NAME = u'smotri' + _VALID_URL = r'^(?:http://)?(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + + _TESTS = [ + # real video id 2610366 + { + u'url': u'http://smotri.com/video/view/?id=v261036632ab', + u'file': u'v261036632ab.mp4', + u'md5': u'46a72e83a6ad8862b64fa6953fa93f8a', + u'info_dict': 
{ + u'title': u'катастрофа с камер видеонаблюдения', + u'uploader': u'rbc2008', + u'uploader_id': u'rbc08', + u'upload_date': u'20131118', + u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg' + }, + }, + # real video id 57591 + { + u'url': u'http://smotri.com/video/view/?id=v57591cb20', + u'file': u'v57591cb20.flv', + u'md5': u'9eae59f6dda7087bf39a140e2fff5757', + u'info_dict': { + u'title': u'test', + u'uploader': u'Support Photofile@photofile', + u'uploader_id': u'support-photofile', + u'upload_date': u'20070704', + u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg' + }, + }, + # video-password + { + u'url': u'http://smotri.com/video/view/?id=v1390466a13c', + u'file': u'v1390466a13c.mp4', + u'md5': u'fe4dd9357558d5ee3c8fc0ef0d39de66', + u'info_dict': { + u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + u'uploader': u'timoxa40', + u'uploader_id': u'timoxa40', + u'upload_date': u'20100404', + u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg' + }, + u'params': { + u'videopassword': u'qwerty', + }, + }, + # age limit + video-password + { + u'url': u'http://smotri.com/video/view/?id=v15408898bcf', + u'file': u'v15408898bcf.flv', + u'md5': u'c66a5d61379ac6fde06f07eebe436316', + u'info_dict': { + u'title': u'этот ролик не покажут по ТВ', + u'uploader': u'zzxxx', + u'uploader_id': u'ueggb', + u'upload_date': u'20101001', + u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', + u'age_limit': 18 + }, + u'params': { + u'videopassword': u'333' + } + } + ] + + _SUCCESS = 0 + _PASSWORD_NOT_VERIFIED = 1 + _PASSWORD_DETECTED = 2 + _VIDEO_NOT_FOUND = 3 + + def _search_meta(self, name, html, display_name=None): + if display_name is None: + display_name = name + return self._html_search_regex( + r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name), + html, display_name, fatal=False) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + real_video_id = mobj.group('realvideoid') + + # Download video JSON data + video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id + video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON') + video_json = json.loads(video_json_page) + + status = video_json['status'] + if status == self._VIDEO_NOT_FOUND: + raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with + # video-password set + video_password = self._downloader.params.get('videopassword', None) + if not video_password: + raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) + video_json_url += '&md5pass=%s' % hashlib.md5(video_password).hexdigest() + video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') + video_json = json.loads(video_json_page) + status = video_json['status'] + if status == self._PASSWORD_NOT_VERIFIED: + raise ExtractorError(u'Video password is invalid', expected=True) + + if status != self._SUCCESS: + raise ExtractorError(u'Unexpected status value %s' % status) + + # Extract the URL of the video + video_url = video_json['file_data'] + video_ext = determine_ext(video_url) + + # Video JSON does not provide enough meta data + # We will extract some from the video web page instead + video_page_url = 'http://' + mobj.group('url') + video_page = self._download_webpage(video_page_url, video_id, u'Downloading 
video page') + + # Adult content + if re.search(u'EroConfirmText">', video_page) is not None: + self.report_age_confirmation() + confirm_string = self._html_search_regex( + ur'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + video_page, u'confirm string') + confirm_url = video_page_url + '&confirm=%s' % confirm_string + video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') + adult_content = True + else: + adult_content = False + + # Extract the rest of meta data + video_title = self._search_meta(u'name', video_page, u'title') + if not video_title: + video_title = video_url.rsplit('/', 1)[-1] + + video_description = self._search_meta(u'description', video_page) + video_thumbnail = self._search_meta(u'thumbnail', video_page) + + upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') + upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) + video_upload_date = ( + ( + upload_date_m.group('year') + + upload_date_m.group('month') + + upload_date_m.group('day') + ) + if upload_date_m else None + ) + + duration_str = self._search_meta(u'duration', video_page) + duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) + video_duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m else None + ) + + video_uploader = self._html_search_regex( + ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', + video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) + + video_uploader_id = self._html_search_regex( + ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\(.*?\'([^\']+)\'\);">', + video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) + + video_view_count = self._html_search_regex( + ur'Общее количество просмотров.*?<span class="Number">(\d+)</span>', + video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': video_ext, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'video_duration': video_duration, + 'view_count': video_view_count, + 'age_limit': 18 if adult_content else 0, + 'video_page_url': video_page_url + } + +class SmotriCommunityIE(InfoExtractor): + IE_DESC = u'Smotri.com community videos' + IE_NAME = u'smotri:community' + _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + community_id = mobj.group('communityid') + + url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id + rss = self._download_xml(url, community_id, u'Downloading community RSS') + + entries = [self.url_result(video_url.text, 'Smotri') + for video_url in rss.findall('./channel/item/link')] + + community_title = self._html_search_regex( + ur'^Видео сообщества "([^"]+)"$', rss.find('./channel/description').text, u'community title') + + return self.playlist_result(entries, community_id, community_title) + +class SmotriUserIE(InfoExtractor): + IE_DESC = u'Smotri.com user videos' + IE_NAME = u'smotri:user' + _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' 
+ + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url); + user_id = mobj.group('userid') + + url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id + rss = self._download_xml(url, user_id, u'Downloading user RSS') + + entries = [self.url_result(video_url.text, 'Smotri') + for video_url in rss.findall('./channel/item/link')] + + user_nickname = self._html_search_regex( + ur'^Видео режиссера (.*)$', rss.find('./channel/description').text, u'user nickname') + + return self.playlist_result(entries, user_id, user_nickname) + \ No newline at end of file From aaebed13a8447961e23cca9c75b097732c246476 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 17:08:17 +0100 Subject: [PATCH 084/121] [smotri] Simplify --- youtube_dl/extractor/common.py | 3 +- youtube_dl/extractor/smotri.py | 91 +++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4f1b50880..1b049082d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -364,7 +364,8 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\']) + r'''(?ix)<meta + (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=False) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index ea42d5320..f035a3214 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -14,46 +14,49 @@ from ..utils import ( class SmotriIE(InfoExtractor): IE_DESC = u'Smotri.com' IE_NAME = u'smotri' - _VALID_URL = r'^(?:http://)?(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' - + _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + _TESTS = [ # real video id 2610366 { u'url': u'http://smotri.com/video/view/?id=v261036632ab', u'file': u'v261036632ab.mp4', - u'md5': u'46a72e83a6ad8862b64fa6953fa93f8a', + u'md5': u'2a7b08249e6f5636557579c368040eb9', u'info_dict': { u'title': u'катастрофа с камер видеонаблюдения', u'uploader': u'rbc2008', u'uploader_id': u'rbc08', u'upload_date': u'20131118', - u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg' + u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', + u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', }, }, # real video id 57591 { u'url': u'http://smotri.com/video/view/?id=v57591cb20', u'file': u'v57591cb20.flv', - u'md5': u'9eae59f6dda7087bf39a140e2fff5757', + u'md5': u'830266dfc21f077eac5afd1883091bcd', u'info_dict': { u'title': u'test', u'uploader': u'Support Photofile@photofile', u'uploader_id': u'support-photofile', u'upload_date': u'20070704', - u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg' - }, + u'description': u'test, видео test', + u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', + }, }, # video-password { u'url': u'http://smotri.com/video/view/?id=v1390466a13c', u'file': u'v1390466a13c.mp4', - u'md5': u'fe4dd9357558d5ee3c8fc0ef0d39de66', + u'md5': u'f6331cef33cad65a0815ee482a54440b', u'info_dict': { u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', u'uploader': u'timoxa40', u'uploader_id': u'timoxa40', u'upload_date': u'20100404', - u'thumbnail': 
u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg' + u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', + u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', }, u'params': { u'videopassword': u'qwerty', @@ -63,15 +66,16 @@ class SmotriIE(InfoExtractor): { u'url': u'http://smotri.com/video/view/?id=v15408898bcf', u'file': u'v15408898bcf.flv', - u'md5': u'c66a5d61379ac6fde06f07eebe436316', + u'md5': u'91e909c9f0521adf5ee86fbe073aad70', u'info_dict': { u'title': u'этот ролик не покажут по ТВ', u'uploader': u'zzxxx', u'uploader_id': u'ueggb', u'upload_date': u'20101001', u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - u'age_limit': 18 - }, + u'age_limit': 18, + u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ', + }, u'params': { u'videopassword': u'333' } @@ -82,14 +86,15 @@ class SmotriIE(InfoExtractor): _PASSWORD_NOT_VERIFIED = 1 _PASSWORD_DETECTED = 2 _VIDEO_NOT_FOUND = 3 - + def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name return self._html_search_regex( r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name), html, display_name, fatal=False) - + return self._html_search_meta(name, html, display_name) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -103,12 +108,12 @@ class SmotriIE(InfoExtractor): status = video_json['status'] if status == self._VIDEO_NOT_FOUND: raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with + elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with # video-password set video_password = self._downloader.params.get('videopassword', None) if not video_password: raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) - video_json_url += '&md5pass=%s' % hashlib.md5(video_password).hexdigest() + video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest() video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') video_json = json.loads(video_json_page) status = video_json['status'] @@ -120,7 +125,6 @@ class SmotriIE(InfoExtractor): # Extract the URL of the video video_url = video_json['file_data'] - video_ext = determine_ext(video_url) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -131,7 +135,7 @@ class SmotriIE(InfoExtractor): if re.search(u'EroConfirmText">', video_page) is not None: self.report_age_confirmation() confirm_string = self._html_search_regex( - ur'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, video_page, u'confirm string') confirm_url = video_page_url + '&confirm=%s' % confirm_string video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') @@ -143,11 +147,17 @@ class SmotriIE(InfoExtractor): video_title = self._search_meta(u'name', video_page, u'title') if not video_title: video_title = video_url.rsplit('/', 1)[-1] - + video_description = self._search_meta(u'description', video_page) + END_TEXT = u' на сайте Smotri.com' + if video_description.endswith(END_TEXT): + video_description = video_description[:-len(END_TEXT)] + 
START_TEXT = u'Смотреть онлайн ролик ' + if video_description.startswith(START_TEXT): + video_description = video_description[len(START_TEXT):] video_thumbnail = self._search_meta(u'thumbnail', video_page) - - upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') + + upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) video_upload_date = ( ( @@ -170,22 +180,21 @@ class SmotriIE(InfoExtractor): ) video_uploader = self._html_search_regex( - ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', + u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) video_uploader_id = self._html_search_regex( - ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\(.*?\'([^\']+)\'\);">', + u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) video_view_count = self._html_search_regex( - ur'Общее количество просмотров.*?<span class="Number">(\d+)</span>', + u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) return { 'id': video_id, 'url': video_url, 'title': video_title, - 'ext': video_ext, 'thumbnail': video_thumbnail, 'description': video_description, 'uploader': video_uploader, @@ -197,43 +206,47 @@ class SmotriIE(InfoExtractor): 'video_page_url': video_page_url } + class SmotriCommunityIE(InfoExtractor): IE_DESC = u'Smotri.com community videos' IE_NAME = u'smotri:community' - _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) community_id = mobj.group('communityid') - + url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id rss = self._download_xml(url, community_id, u'Downloading community RSS') - + entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - + + description_text = rss.find('./channel/description').text community_title = self._html_search_regex( - ur'^Видео сообщества "([^"]+)"$', rss.find('./channel/description').text, u'community title') + u'^Видео сообщества "([^"]+)"$', description_text, u'community title') return self.playlist_result(entries, community_id, community_title) - + + class SmotriUserIE(InfoExtractor): IE_DESC = u'Smotri.com user videos' IE_NAME = u'smotri:user' - _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' - + _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url); + mobj = re.match(self._VALID_URL, url) user_id = mobj.group('userid') - + url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id rss = self._download_xml(url, user_id, u'Downloading user RSS') - + entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - + + description_text = rss.find('./channel/description').text user_nickname = self._html_search_regex( - ur'^Видео режиссера (.*)$', rss.find('./channel/description').text, u'user 
nickname') + u'^Видео режиссера (.*)$', description_text, + u'user nickname') return self.playlist_result(entries, user_id, user_nickname) - \ No newline at end of file From 87968574293ef87b98f51cf0d7c0958b9f496a7a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 17:43:22 +0100 Subject: [PATCH 085/121] Credit @dstftw for smotri IE --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 799eca566..2eeef2ae9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -36,6 +36,7 @@ __authors__ = ( 'Marcin Cieślak', 'Anton Larionov', 'Takuya Tsuchida', + 'Sergey M.', ) __license__ = 'Public Domain' From 36a826a50dc5e53af8355f1233cc4f3ceba2e61b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 11:54:52 +0100 Subject: [PATCH 086/121] Clarify --download-archive help (#1757) --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2eeef2ae9..48137ebe5 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -224,7 +224,7 @@ def parseOpts(overrideArguments=None): default=None, type=int) selection.add_option('--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not present in the archive file. Record the IDs of all downloaded videos in it.') + help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') authentication.add_option('-u', '--username', From 1b753cb3344837fb69e9bfde89d03161d33ba3ff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:04:02 +0100 Subject: [PATCH 087/121] Add Windows configuration file locations (#1881) --- README.md | 2 +- youtube_dl/__init__.py | 29 ++++++++++++++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 031e436b6..0ff6ff8b9 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config` and `C:\Users\<Yourname>\youtube-dl.conf`. 
# OUTPUT TEMPLATE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 48137ebe5..32490b24e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -81,15 +81,13 @@ from .PostProcessor import ( def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes): + def _readOptions(filename_bytes, def=[]): try: optionf = open(filename_bytes) except IOError: - return [] # silently skip if file is not present + return def # silently skip if file is not present try: - res = [] - for l in optionf: - res += shlex.split(l, comments=True) + res = [shlex.split(l, comments=True) for l in optionf] finally: optionf.close() return res @@ -419,6 +417,8 @@ def parseOpts(overrideArguments=None): if opts.verbose: write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n') else: + systemConf = _readOptions('/etc/youtube-dl.conf') + xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') @@ -428,8 +428,23 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') if not os.path.isfile(userConfFile): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') - systemConf = _readOptions('/etc/youtube-dl.conf') - userConf = _readOptions(userConfFile) + userConf = _readOptions(userConfFile, None) + + if userConf is None: + appdata_dir = os.environ.get('appdata') + if appdata_dir: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config'), + def=None) + + if userConf is None: + userConfFile = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'), + def=None) + + if userConf is None: + userConf = [] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) From fb27c2295e0e9d6f2f6ac45ed5906987b4710d0a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:09:48 +0100 Subject: [PATCH 088/121] Correct configuration file locations --- README.md | 2 +- youtube_dl/__init__.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0ff6ff8b9..85af7cf7e 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config` and `C:\Users\<Yourname>\youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`. 
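A minimal sketch of such a configuration file (the particular options chosen here are only an illustration, not shipped defaults): each line holds command-line options, and `#` starts a comment, since the files are read with `shlex.split(..., comments=True)`.

    # always extract the audio and do not copy the mtime
    --extract-audio
    --no-mtime
    # force a direct connection
    --proxy ""
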
# OUTPUT TEMPLATE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 32490b24e..9c8a694f0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -81,11 +81,11 @@ from .PostProcessor import ( def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes, def=[]): + def _readOptions(filename_bytes, default=[]): try: optionf = open(filename_bytes) except IOError: - return def # silently skip if file is not present + return default # silently skip if file is not present try: res = [shlex.split(l, comments=True) for l in optionf] finally: @@ -435,12 +435,20 @@ def parseOpts(overrideArguments=None): if appdata_dir: userConf = _readOptions( os.path.join(appdata_dir, 'youtube-dl', 'config'), - def=None) + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), + default=None) if userConf is None: - userConfFile = _readOptions( + userConf = _readOptions( os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'), - def=None) + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'), + default=None) if userConf is None: userConf = [] From a0eaa341e1ce6254179c1a00a11704da1887e124 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:11:20 +0100 Subject: [PATCH 089/121] [configuration] Undo code breakage --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9c8a694f0..fff295e8c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -87,7 +87,9 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = [shlex.split(l, comments=True) for l in optionf] + res = [] + for l in optionf: + res += shlex.split(l, comments=True) finally: optionf.close() return res From 731e3dde299844fc3b0f369d5a161fa4df0eb718 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:13:09 +0100 Subject: [PATCH 090/121] release 2013.12.03 --- README.md | 2 +- youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85af7cf7e..00975ab5e 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ which means you can modify it, redistribute it or use it however you like. --dateafter DATE download only videos uploaded after this date --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age - --download-archive FILE Download only videos not present in the archive + --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. 
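A usage sketch of the archive option (the file name, URL placeholder and video ID below are hypothetical):

    youtube-dl --download-archive downloaded.txt <playlist-url>

youtube-dl appends one line per finished download to downloaded.txt -- in current versions a pairing of extractor name and video ID such as "youtube dQw4w9WgXcQ", though the exact format is an implementation detail -- and on later runs skips any video already listed in that file.
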
diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d8f341ab9..f9a339c02 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.02' +__version__ = '2013.12.03' From cf6758d2040816033ec47afe9c1d497e4c2abd4d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:33:07 +0100 Subject: [PATCH 091/121] Document disabling proxy (#1882) --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fff295e8c..d2446b670 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -191,7 +191,9 @@ def parseOpts(overrideArguments=None): general.add_option('--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', help='Output descriptions of all supported extractors', default=False) - general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') + general.add_option( + '--proxy', dest='proxy', default=None, metavar='URL', + help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', From cb7fb54600a96bcced33020b925f2cfc9428bd4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 13:55:25 +0100 Subject: [PATCH 092/121] Change the ie_name of YoutubeSearchDateIE It produced a duplicate entry when listing the extractors with '--list-extractors' and generates noise in the commit log when generating the supported sites webpage (like in 09f355f73bf1657ecacfd05eda21d2c4bf1cc4a8) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 765b4a9bf..66f5af000 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1765,6 +1765,7 @@ class YoutubeSearchIE(SearchInfoExtractor): return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): + IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = u'YouTube.com searches, newest videos first' From e9d8e302aafdb6fcf72c44d582c1f6d4447cd5fc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 14:06:08 +0100 Subject: [PATCH 093/121] [xhamster] Change test checksum --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 7444d3393..279f75e7a 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -26,7 +26,7 @@ class XHamsterIE(InfoExtractor): { u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', u'file': u'2221348.flv', - u'md5': u'e767b9475de189320f691f49c679c4c7', + u'md5': u'970a94178ca4118c5aa3aaea21211b81', u'info_dict': { u"upload_date": u"20130914", u"uploader_id": u"jojo747400", From 938384c587c33696bcdb9c28b982e2b744695b3d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: 
Tue, 3 Dec 2013 14:08:16 +0100 Subject: [PATCH 094/121] [redtube] Fix search for title --- youtube_dl/extractor/redtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 3bbda128e..c2254ae8a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -30,7 +30,7 @@ class RedTubeIE(InfoExtractor): r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL') video_title = self._html_search_regex( - r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', + r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') # No self-labeling, but they describe themselves as From ce93879a9b3b1661db3e65ec43649c5b6a08778c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 14:16:58 +0100 Subject: [PATCH 095/121] [daum] Fix real video ID extraction --- youtube_dl/extractor/daum.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 3d1dcb793..fe7cfb064 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -28,7 +28,8 @@ class DaumIE(InfoExtractor): video_id = mobj.group(1) canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) - full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', + full_id = self._search_regex( + r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', webpage, u'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( From fb7abb31af93a2a1d84ba17beb0f389dd09eafdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 14:21:06 +0100 Subject: [PATCH 096/121] Remove the compatibility code used before the new format system was implemented --- youtube_dl/extractor/appletrailers.py | 9 ++------- youtube_dl/extractor/archiveorg.py | 11 ++--------- youtube_dl/extractor/comedycentral.py | 9 ++------- youtube_dl/extractor/daum.py | 5 +---- youtube_dl/extractor/dreisat.py | 7 +------ youtube_dl/extractor/faz.py | 5 +---- youtube_dl/extractor/gamespot.py | 5 +---- youtube_dl/extractor/metacritic.py | 5 +---- youtube_dl/extractor/mtv.py | 7 +------ youtube_dl/extractor/naver.py | 5 +---- youtube_dl/extractor/trilulilu.py | 6 +----- youtube_dl/extractor/viddler.py | 8 +------- 12 files changed, 15 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 6d6237f8a..4befff394 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor): }) formats = sorted(formats, key=lambda f: (f['height'], f['width'])) - info = { + playlist.append({ '_type': 'video', 'id': video_id, 'title': title, @@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor): 'upload_date': upload_date, 'uploader_id': uploader_id, 'user_agent': 'QuickTime compatible (youtube-dl)', - } - # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['ext'] - - playlist.append(info) + }) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 61ce4469a..3ae0aebb1 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -49,7 +49,7 
@@ class ArchiveOrgIE(InfoExtractor): for f in formats: f['ext'] = determine_ext(f['url']) - info = { + return { '_type': 'video', 'id': video_id, 'title': title, @@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor): 'description': description, 'uploader': uploader, 'upload_date': upload_date, + 'thumbnail': data.get('misc', {}).get('image'), } - thumbnail = data.get('misc', {}).get('image') - if thumbnail: - info['thumbnail'] = thumbnail - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 23647f99e..41ef9ad47 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -197,7 +197,7 @@ class ComedyCentralShowsIE(InfoExtractor): }) effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) - info = { + results.append({ 'id': shortMediaId, 'formats': formats, 'uploader': showId, @@ -205,11 +205,6 @@ class ComedyCentralShowsIE(InfoExtractor): 'title': effTitle, 'thumbnail': None, 'description': compat_str(officialTitle), - } - - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - results.append(info) + }) return results diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index fe7cfb064..d418ce4a8 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -57,7 +57,7 @@ class DaumIE(InfoExtractor): 'format_id': profile, }) - info = { + return { 'id': video_id, 'title': info.find('TITLE').text, 'formats': formats, @@ -66,6 +66,3 @@ class DaumIE(InfoExtractor): 'duration': int(info.find('DURATION').text), 'upload_date': info.find('REGDTTM').text[:8], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 3cb382e12..24ce79425 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -65,7 +65,7 @@ class DreiSatIE(InfoExtractor): return (qidx, prefer_http, format['video_bitrate']) formats.sort(key=_sortkey) - info = { + return { '_type': 'video', 'id': video_id, 'title': video_title, @@ -76,8 +76,3 @@ class DreiSatIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': upload_date, } - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index c0169de04..d0dfde694 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -44,13 +44,10 @@ class FazIE(InfoExtractor): }) descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') - info = { + return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, 'description': descr, 'thumbnail': config.find('STILL/STILL_BIG').text, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 9645b00c3..26b7d2ae5 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor): 'format_id': q, }) - info = { + return { 'id': data_video['guid'], 'title': compat_urllib_parse.unquote(data_video['title']), 'formats': formats, 'description': get_meta_content('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - 
return info diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 449138b56..6b95b4998 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -43,13 +43,10 @@ class MetacriticIE(InfoExtractor): description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', webpage, u'description', flags=re.DOTALL) - info = { + return { 'id': video_id, 'title': clip.find('title').text, 'formats': formats, 'description': description, 'duration': int(clip.find('duration').text), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 42aee58be..972336782 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -93,7 +93,7 @@ class MTVIE(InfoExtractor): else: description = None - info = { + return { 'title': itemdoc.find('title').text, 'formats': self._extract_video_formats(mediagen_page), 'id': video_id, @@ -101,11 +101,6 @@ class MTVIE(InfoExtractor): 'description': description, } - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - return info - def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index d290397c7..c012ec0cf 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -56,7 +56,7 @@ class NaverIE(InfoExtractor): 'height': int(format_el.find('height').text), }) - info = { + return { 'id': video_id, 'title': info.find('Subject').text, 'formats': formats, @@ -65,6 +65,3 @@ class NaverIE(InfoExtractor): 'upload_date': info.find('WriteDate').text.replace('.', ''), 'view_count': int(info.find('PlayCount').text), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 1c49e580d..d64aaa41f 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -55,7 +55,7 @@ class TriluliluIE(InfoExtractor): for fnode in format_doc.findall('./formats/format') ] - info = { + return { '_type': 'video', 'id': video_id, 'formats': formats, @@ -64,7 +64,3 @@ class TriluliluIE(InfoExtractor): 'thumbnail': thumbnail, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 826804af3..75335dfb8 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -47,7 +47,7 @@ class ViddlerIE(InfoExtractor): r"thumbnail\s*:\s*'([^']*)'", webpage, u'thumbnail', fatal=False) - info = { + return { '_type': 'video', 'id': video_id, 'title': title, @@ -56,9 +56,3 @@ class ViddlerIE(InfoExtractor): 'duration': duration, 'formats': formats, } - - # TODO: Remove when #980 has been merged - info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url']) - info.update(info['formats'][-1]) - - return info From 84db81815af6787d91188ca065cc9ced4d83a4ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 14:58:24 +0100 Subject: [PATCH 097/121] Move common code for extractors based in MTV services to a new base class Removes the duplication of the thumbnail extraction code (only MTVIE needs to override it) --- youtube_dl/extractor/comedycentral.py | 10 +--- 
youtube_dl/extractor/gametrailers.py | 16 ++---- youtube_dl/extractor/mtv.py | 69 ++++++++++++++---------- youtube_dl/extractor/southparkstudios.py | 13 +---- 4 files changed, 48 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 41ef9ad47..53579aa27 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +11,7 @@ from ..utils import ( ) -class ComedyCentralIE(MTVIE): +class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' @@ -25,12 +25,6 @@ class ComedyCentralIE(MTVIE): u'description': u'After a certain point, breastfeeding becomes c**kblocking.', }, } - # Overwrite MTVIE properties we don't want - _TESTS = [] - - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 3cc02d97e..3a8bef250 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,13 +1,11 @@ import re -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor -class GametrailersIE(MTVIE): - """ - Gametrailers use the same videos system as MTVIE, it just changes the feed - url, where the uri is and the method to get the thumbnails. - """ + +class GametrailersIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' + _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', @@ -17,15 +15,9 @@ class GametrailersIE(MTVIE): u'description': u'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } - # Overwrite MTVIE properties we don't want - _TESTS = [] _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 972336782..6b3feb560 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -10,35 +10,8 @@ from ..utils import ( def _media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' - - _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' - - _TESTS = [ - { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', - }, - }, - { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', - }, - u'skip': u'VEVO is only available in some countries', - }, - ] +class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _id_from_uri(uri): return uri.split(':')[-1] @@ -53,7 +26,12 @@ class MTVIE(InfoExtractor): return base + m.group('finalid') def _get_thumbnail_url(self, uri, itemdoc): - return 'http://mtv.mtvnimages.com/uri/' + uri + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + else: + return thumb_node.attrib['url'] def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: @@ -108,6 +86,39 @@ class MTVIE(InfoExtractor): u'Downloading info') return [self._get_video_info(item) for item in idoc.findall('.//item')] + +class MTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + + _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' + + _TESTS = [ + { + u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + u'file': u'853555.mp4', + u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', + u'info_dict': { + u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', + u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + }, + }, + { + u'add_ie': ['Vevo'], + u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + u'file': u'USCJY1331283.mp4', + u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', + u'info_dict': { + u'title': u'Everything Has Changed', + u'upload_date': u'20130606', + u'uploader': u'Taylor Swift', + }, + u'skip': u'VEVO is only available in some countries', + }, + ] + + def _get_thumbnail_url(self, uri, itemdoc): + return 'http://mtv.mtvnimages.com/uri/' + uri + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) 
video_id = mobj.group('videoid') diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index a711531e6..fd90cc5dd 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -1,15 +1,14 @@ import re -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor -class SouthParkStudiosIE(MTVIE): +class SouthParkStudiosIE(MTVServicesInfoExtractor): IE_NAME = u'southparkstudios.com' _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' - # Overwrite MTVIE properties we don't want _TESTS = [{ u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', @@ -19,14 +18,6 @@ class SouthParkStudiosIE(MTVIE): }, }] - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - thumb_node = itemdoc.find(search_path) - if thumb_node is None: - return None - else: - return thumb_node.attrib['url'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) url = u'http://www.' + mobj.group(u'url') From 27dcce19045670fc348ff1119c0d2283aaed3ae2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 4 Dec 2013 14:16:52 +0100 Subject: [PATCH 098/121] [youtube] Resolve URLs in comments --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 66f5af000..7fff761bd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -336,7 +336,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader": u"Philipp Hagemeister", u"uploader_id": u"phihag", u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." + u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." } }, { @@ -1366,6 +1366,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: + video_description = re.sub(r'''(?x) + <a\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? + title="([^"]+)"\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? + class="yt-uix-redirect-link"\s*> + [^<]+ + </a> + ''', r'\1', video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) From 671c0f151d5a7bb5c32a59f483a8e330f1f9a15b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 4 Dec 2013 14:19:07 +0100 Subject: [PATCH 099/121] release 2013.12.04 --- README.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00975ab5e..029c418d1 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ which means you can modify it, redistribute it or use it however you like. 
--list-extractors List all supported extractors and the URLs they would handle --extractor-descriptions Output descriptions of all supported extractors - --proxy URL Use the specified HTTP/HTTPS proxy + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an + empty string (--proxy "") for direct connection --no-check-certificate Suppress HTTPS certificate validation. --cache-dir DIR Location in the filesystem where youtube-dl can store downloaded information permanently. By diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f9a339c02..68b30bfd4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.03' +__version__ = '2013.12.04' From c0ade33e167d1668c4aa8a6684e7083e6c71dd6e Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 20:34:47 +0700 Subject: [PATCH 100/121] Correct some extractor _VALID_URL regexes --- youtube_dl/extractor/addanime.py | 2 +- youtube_dl/extractor/appletrailers.py | 2 +- youtube_dl/extractor/archiveorg.py | 2 +- youtube_dl/extractor/arte.py | 4 ++-- youtube_dl/extractor/auengine.py | 2 +- youtube_dl/extractor/bambuser.py | 2 +- youtube_dl/extractor/bloomberg.py | 2 +- youtube_dl/extractor/comedycentral.py | 2 +- youtube_dl/extractor/cspan.py | 2 +- youtube_dl/extractor/dreisat.py | 2 +- youtube_dl/extractor/eighttracks.py | 2 +- youtube_dl/extractor/exfm.py | 2 +- youtube_dl/extractor/faz.py | 2 +- youtube_dl/extractor/fktv.py | 4 ++-- youtube_dl/extractor/francetv.py | 2 +- youtube_dl/extractor/gamekings.py | 2 +- youtube_dl/extractor/gametrailers.py | 2 +- youtube_dl/extractor/ign.py | 2 +- youtube_dl/extractor/instagram.py | 2 +- youtube_dl/extractor/jukebox.py | 2 +- youtube_dl/extractor/liveleak.py | 2 +- youtube_dl/extractor/livestream.py | 2 +- youtube_dl/extractor/muzu.py | 2 +- youtube_dl/extractor/myspass.py | 2 +- youtube_dl/extractor/orf.py | 2 +- youtube_dl/extractor/pbs.py | 2 +- youtube_dl/extractor/rutube.py | 2 +- youtube_dl/extractor/slashdot.py | 2 +- youtube_dl/extractor/soundcloud.py | 4 ++-- youtube_dl/extractor/space.py | 2 +- youtube_dl/extractor/stanfordoc.py | 2 +- youtube_dl/extractor/tf1.py | 2 +- youtube_dl/extractor/unistra.py | 2 +- youtube_dl/extractor/veehd.py | 2 +- youtube_dl/extractor/vevo.py | 2 +- youtube_dl/extractor/vice.py | 2 +- youtube_dl/extractor/viddler.py | 2 +- youtube_dl/extractor/videofyme.py | 2 +- youtube_dl/extractor/wat.py | 2 +- youtube_dl/extractor/youjizz.py | 2 +- 40 files changed, 43 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index b99d4b966..a3a1b999d 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -13,7 +13,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 6d6237f8a..5b522552a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -10,7 +10,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _VALID_URL = 
r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TEST = { u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", u"playlist": [ diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 61ce4469a..a8394bfb0 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -11,7 +11,7 @@ from ..utils import ( class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$' + _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$' _TEST = { u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8b62ee774..56a5d009f 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,8 +17,8 @@ from ..utils import ( # add tests. class ArteTvIE(InfoExtractor): - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' - _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' + _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html' + _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 95c038003..bcccc0b7a 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor): u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]" } } - _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?' + _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?' 
def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index b80508efe..d48c0c38d 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -54,7 +54,7 @@ class BambuserIE(InfoExtractor): class BambuserChannelIE(InfoExtractor): IE_NAME = u'bambuser:channel' - _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)' + _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)' # The maximum number we can get with each request _STEP = 50 diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 3666a780b..755d9c9ef 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' _TEST = { u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 23647f99e..caea446ea 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -12,7 +12,7 @@ from ..utils import ( class ComedyCentralIE(MTVIE): - _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' _TEST = { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7bf03c584..d5730684d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -6,7 +6,7 @@ from ..utils import ( ) class CSpanIE(InfoExtractor): - _VALID_URL = r'http://www.c-spanvideo.org/program/(.*)' + _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' _TEST = { u'url': u'http://www.c-spanvideo.org/program/HolderonV', u'file': u'315139.flv', diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 3cb382e12..008c99699 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -11,7 +11,7 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", u'file': u'36983.webm', diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index f21ef8853..88f5526b8 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -10,7 +10,7 @@ from ..utils import ( class EightTracksIE(InfoExtractor): IE_NAME = '8tracks' - _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' + _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' _TEST = { u"name": u"EightTracks", u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a", diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index a51d79b08..682901d16 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor): IE_NAME = 
u'exfm' IE_DESC = u'ex.fm' _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' - _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { u'url': u'http://ex.fm/song/eh359', diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index c0169de04..615674baf 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -9,7 +9,7 @@ from ..utils import ( class FazIE(InfoExtractor): IE_NAME = u'faz.net' - _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' _TEST = { u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index dba1a8dc2..d7048c8c1 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -12,7 +12,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv' - _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' _TEST = { u'url': u'http://fernsehkritik.tv/folge-1', @@ -52,7 +52,7 @@ class FKTVIE(InfoExtractor): class FKTVPosteckeIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv:postecke' - _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' _TEST = { u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', u'file': u'0120.flv', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 6e1971043..66aa3aa0d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -45,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = u'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html' + _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html' _TEST = { u'url': u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index c91669b0e..a3a5251fe 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class GamekingsIE(InfoExtractor): - _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' + _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", u'file': u'20130811.mp4', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 3cc02d97e..88f656031 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -7,7 +7,7 @@ class GametrailersIE(MTVIE): Gametrailers use the same videos system as MTVIE, it just changes the feed url, where the uri is and the method to get the thumbnails. 
""" - _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' + _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index c52146f7d..57b79a336 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -103,7 +103,7 @@ class IGNIE(InfoExtractor): class OneUPIE(IGNIE): """Extractor for 1up.com, it uses the ign videos system.""" - _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)' + _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 213aac428..660573d02 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor class InstagramIE(InfoExtractor): - _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' + _VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/' _TEST = { u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index c7bb234fe..592c64e1d 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -8,7 +8,7 @@ from ..utils import ( ) class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>' _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"' _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>' diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index dd062a14e..5ae57a77c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -8,7 +8,7 @@ from ..utils import ( class LiveLeakIE(InfoExtractor): - _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' + _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' IE_NAME = u'liveleak' _TEST = { u'url': u'http://www.liveleak.com/view?i=757_1364311680', diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 9bc35b115..1dcd1fb2d 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -11,7 +11,7 @@ from ..utils import ( class LivestreamIE(InfoExtractor): IE_NAME = u'livestream' - _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' + _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' _TEST = { u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', u'file': u'4719370.mp4', diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py index 03e31ea1c..1772b7f9a 100644 --- a/youtube_dl/extractor/muzu.py +++ b/youtube_dl/extractor/muzu.py @@ -9,7 +9,7 @@ from ..utils import ( class MuzuTVIE(InfoExtractor): - _VALID_URL = 
r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)' IE_NAME = u'muzu.tv' _TEST = { diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 0067bf134..4becddee6 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -9,7 +9,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www.myspass.de/.*' + _VALID_URL = r'http://www\.myspass\.de/.*' _TEST = { u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', u'file': u'11741.mp4', diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cfca2a063..b42eae89a 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -12,7 +12,7 @@ from ..utils import ( ) class ORFIE(InfoExtractor): - _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 65462d867..25f019231 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class PBSIE(InfoExtractor): - _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?' + _VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?' _TEST = { u'url': u'http://video.pbs.org/video/2365006249/', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index a18034fe2..e3e9bc07f 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -11,7 +11,7 @@ from ..utils import ( class RutubeIE(InfoExtractor): - _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)' + _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)' _TEST = { u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index f5003c7f9..d68646d24 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class SlashdotIE(InfoExtractor): - _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)' + _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)' _TEST = { u'add_ie': ['Ooyala'], diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 3a19ab172..cb6dedab7 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -25,7 +25,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''^(?:https?://)? 
(?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) - |(?P<widget>w.soundcloud.com/player/?.*?url=.*) + |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = u'soundcloud' @@ -217,7 +217,7 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' + _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' IE_NAME = u'soundcloud:user' # it's in tests/test_playlists.py diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 0d32a0688..11455e0fa 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -6,7 +6,7 @@ from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' + _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { u'add_ie': ['Brightcove'], u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index b27838bf9..d54e01a12 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -18,7 +18,7 @@ from ..utils import ( class StanfordOpenClassroomIE(InfoExtractor): IE_NAME = u'stanfordoc' IE_DESC = u'Stanford Open ClassRoom' - _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' _TEST = { u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', u'file': u'PracticalUnix_intro-environment.mp4', diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 772134a12..2c5c88be8 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' + _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html' _TEST = { u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', u'file': u'10635995.mp4', diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 516e18914..474610eec 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)' + _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)' _TEST = { u'url': u'http://utv.unistra.fr/video.php?id_video=154', diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py index 3a99a29c6..3cf8c853d 100644 --- a/youtube_dl/extractor/veehd.py +++ b/youtube_dl/extractor/veehd.py @@ -9,7 +9,7 @@ from ..utils import ( ) class VeeHDIE(InfoExtractor): - _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)' + _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)' _TEST = { u'url': u'http://veehd.com/video/4686958', diff 
--git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4378b1780..d8bfcd155 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,7 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'((http://www\.vevo\.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 6b93afa50..87812d6af 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -6,7 +6,7 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)' + _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)' _TEST = { u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 826804af3..36d1bde08 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -8,7 +8,7 @@ from ..utils import ( class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { u"url": u"http://www.viddler.com/v/43903784", u'file': u'43903784.mp4', diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 912802d9a..f75169041 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -7,7 +7,7 @@ from ..utils import ( ) class VideofyMeIE(InfoExtractor): - _VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)' + _VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)' IE_NAME = u'videofy.me' _TEST = { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 29c25f0e3..4fab6c6e8 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -11,7 +11,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html' + _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html' IE_NAME = 'wat.tv' _TEST = { u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 1fcc518ac..e971b5b4b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -7,7 +7,7 @@ from ..utils import ( class YouJizzIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$' _TEST = { u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', u'file': u'2189178.flv', From 6a656a843a629ceef6979976a353d177c97b9527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 20:35:00 +0100 Subject: [PATCH 101/121] Update description value for the write_info_json test (required after 27dcce19045670fc348ff1119c0d2283aaed3ae2) --- test/test_write_info_json.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py 
index d7177611b..90426a559 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -33,6 +33,7 @@ TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.info.json' DESCRIPTION_FILE = TEST_ID + '.mp4.description' EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐 +test URL: https://github.com/rg3/youtube-dl/issues/1892 This is a test video for youtube-dl. From bfb9f7bc4c5c6fd9b2d3d46be133988f70534d26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 20:36:26 +0100 Subject: [PATCH 102/121] [hotnewhiphop] Update test's title --- youtube_dl/extractor/hotnewhiphop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 3798118a7..0ee74fb38 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -11,7 +11,7 @@ class HotNewHipHopIE(InfoExtractor): u'file': u'1435540.mp3', u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', u'info_dict': { - u"title": u"Freddie Gibbs - Lay It Down" + u"title": u'Freddie Gibbs "Lay It Down"' } } From e9bf7479d209c2623753628201ca0daffa19f3cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 23:28:40 +0100 Subject: [PATCH 103/121] Add an extractor for theplatform.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/theplatform.py | 69 +++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/theplatform.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..900a6f02f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -144,6 +144,7 @@ from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE +from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .toutv import TouTvIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py new file mode 100644 index 000000000..d1d6a4c2c --- /dev/null +++ b/youtube_dl/extractor/theplatform.py @@ -0,0 +1,69 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + xpath_with_ns, + find_xpath_attr, +) + +_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) + + +class ThePlatformIE(InfoExtractor): + _VALID_URL = r'https?://link\.theplatform\.com/s/[^/]+/(?P<id>[^/\?]+)' + + _TEST = { + # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ + u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', + u'info_dict': { + u'id': u'e9I_cZgTgIPd', + u'ext': u'flv', + u'title': u'Blackberry\'s big, bold Z30', + u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', + u'duration': 247, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _get_info(self, video_id): + smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 
+ 'format=smil&mbr=true'.format(video_id)) + meta = self._download_xml(smil_url, video_id) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json) + + head = meta.find(_x('smil:head')) + body = meta.find(_x('smil:body')) + base_url = head.find(_x('smil:meta')).attrib['base'] + switch = body.find(_x('smil:switch')) + formats = [] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + formats.append({ + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': int(attr['width']), + 'height': int(attr['height']), + 'vbr': int(attr['system-bitrate']), + }) + formats.sort(key=lambda f: (f['height'], f['width'], f['vbr'])) + + return { + 'id': video_id, + 'title': info['title'], + 'formats': formats, + 'description': info['description'], + 'thumbnail': info['defaultThumbnailUrl'], + 'duration': info['duration']//1000, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + return self._get_info(video_id) From b9a2c53833a3cebc32df908aad74f7c5a3537aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 23:43:50 +0100 Subject: [PATCH 104/121] [metacafe] Add support for cbs videos (fixes #1838) They use theplatform.com --- youtube_dl/extractor/metacafe.py | 29 +++++++++++++++++++++++++---- youtube_dl/extractor/theplatform.py | 2 +- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 91480ba87..e59bdd604 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -69,6 +69,21 @@ class MetacafeIE(InfoExtractor): u'age_limit': 18, }, }, + # cbs video + { + u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/', + u'info_dict': { + u'id': u'0rOxMBabDXN6', + u'ext': u'flv', + u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet', + u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d', + u'duration': 129, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, ] @@ -106,10 +121,16 @@ class MetacafeIE(InfoExtractor): video_id = mobj.group(1) - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] + # the video may come from an external site + m_external = re.match('^(\w{2})-(.*)$', video_id) + if m_external is not None: + prefix, ext_id = m_external.groups() + # Check if video comes from YouTube + if prefix == 'yt': + return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') + # CBS videos use theplatform.com + if prefix == 'cb': + return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') # Retrieve video webpage to extract further information req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index d1d6a4c2c..920689511 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -11,7 +11,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): - _VALID_URL = r'https?://link\.theplatform\.com/s/[^/]+/(?P<id>[^/\?]+)' + 
_VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)' _TEST = { # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ From 673d1273ff6f6d3267728fbe6f79c9c801598fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 12:41:58 +0100 Subject: [PATCH 105/121] [vevo] Support '/watch/{id}' urls --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index d8bfcd155..3eedcf7dd 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,7 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www\.vevo\.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', From 7fc3fa0545f8a07414e8c97be9862a3c2f79bb98 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 14:29:08 +0100 Subject: [PATCH 106/121] [9gag] Add extractor --- youtube_dl/YoutubeDL.py | 34 +++++++++++++++++++++++--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ninegag.py | 41 ++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 youtube_dl/extractor/ninegag.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b68b110a4..8ad7bd1da 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -405,7 +405,8 @@ class YoutubeDL(object): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}): + def extract_info(self, url, download=True, ie_key=None, extra_info={}, + process=True): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. 
@@ -441,7 +442,10 @@ class YoutubeDL(object): 'webpage_url': url, 'extractor_key': ie.ie_key(), }) - return self.process_ie_result(ie_result, download, extra_info) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break @@ -474,8 +478,32 @@ class YoutubeDL(object): download, ie_key=ie_result.get('ie_key'), extra_info=extra_info) - elif result_type == 'playlist': + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + def make_result(embedded_info): + new_result = ie_result.copy() + for f in ('_type', 'url', 'ext', 'player_url', 'formats', + 'entries', 'urlhandle', 'ie_key', 'duration', + 'subtitles', 'annotations', 'format'): + if f in new_result: + del new_result[f] + if f in embedded_info: + new_result[f] = embedded_info[f] + return new_result + new_result = make_result(info) + + assert new_result.get('_type') != 'url_transparent' + if new_result.get('_type') == 'compat_list': + new_result['entries'] = [ + make_result(e) for e in new_result['entries']] + + return self.process_ie_result( + new_result, download=download, extra_info=extra_info) + elif result_type == 'playlist': # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..a77e98d49 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -102,6 +102,7 @@ from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE +from .ninegag import NineGagIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py new file mode 100644 index 000000000..cc00ffbcc --- /dev/null +++ b/youtube_dl/extractor/ninegag.py @@ -0,0 +1,41 @@ +import json +import re + +from .common import InfoExtractor + + +class NineGagIE(InfoExtractor): + IE_NAME = '9gag' + _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)' + + _TEST = { + u"url": u"http://9gag.tv/v/1912", + u"file": u"1912.mp4", + u"info_dict": { + u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. 
(Thanks, Dino!)", + u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome" + }, + u'add_ie': [u'Youtube'] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + data_json = self._html_search_regex(r'''(?x) + <div\s*id="tv-video"\s*data-video-source="youtube"\s* + data-video-meta="([^"]+)"''', webpage, u'video metadata') + + data = json.loads(data_json) + + return { + '_type': 'url_transparent', + 'url': data['youtubeVideoId'], + 'ie_key': 'Youtube', + 'id': video_id, + 'title': data['title'], + 'description': data['description'], + 'view_count': int(data['view_count']), + 'thumbnail': data['thumbnail_url'], + } From a1ef7e85d6834d5e8a9a2171b220a9e3b93dd2cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 14:31:54 +0100 Subject: [PATCH 107/121] Remove unused imports --- youtube_dl/extractor/smotri.py | 1 - youtube_dl/extractor/theplatform.py | 1 - youtube_dl/extractor/viddler.py | 3 --- youtube_dl/extractor/yahoo.py | 2 +- youtube_dl/utils.py | 1 - 5 files changed, 1 insertion(+), 7 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f035a3214..5a28bc820 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -6,7 +6,6 @@ import hashlib from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError ) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 920689511..61452e47d 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -4,7 +4,6 @@ import json from .common import InfoExtractor from ..utils import ( xpath_with_ns, - find_xpath_attr, ) _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 138a35b2a..9328ef4a2 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -2,9 +2,6 @@ import json import re from .common import InfoExtractor -from ..utils import ( - determine_ext, -) class ViddlerIE(InfoExtractor): diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e457c4707..5c9c361b9 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -47,7 +47,7 @@ class YahooIE(InfoExtractor): # The 'meta' field is not always in the video webpage, we request it # from another page long_id = info['id'] - return self._get_info(info['id'], video_id) + return self._get_info(long_id, video_id) def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c486ef8ec..77609f7ca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,7 +17,6 @@ import ssl import socket import sys import traceback -import xml.etree.ElementTree import zlib try: From 19e3dfc9f8444a1341a6e71752a3235a0447a565 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 18:29:07 +0100 Subject: [PATCH 108/121] [9gag] Like/dislike count (#1895) --- youtube_dl/extractor/common.py | 3 +++ youtube_dl/extractor/ninegag.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1b049082d..92a0c5050 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -55,6 +55,9 @@ class 
InfoExtractor(object): subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. view_count: How many users have watched the video on the platform. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + comment_count: Number of comments on the video urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen age_limit: Age restriction for the video, as an integer (years) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index cc00ffbcc..ea986c00e 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -37,5 +37,7 @@ class NineGagIE(InfoExtractor): 'title': data['title'], 'description': data['description'], 'view_count': int(data['view_count']), + 'like_count': int(data['statistic']['like']), + 'dislike_count': int(data['statistic']['dislike']), 'thumbnail': data['thumbnail_url'], } From 9e6060208430cef6af5e1f6ae24feb65c35fc03c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 20:45:53 +0100 Subject: [PATCH 109/121] =?UTF-8?q?[francetv]=20Add=20support=20for=20more?= =?UTF-8?q?=20channels:=203,=204,=205=20and=20=C3=94=20(#1898)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the France2IE extractor to FranceTVIE --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/francetv.py | 103 +++++++++++++++++++++++++------ 2 files changed, 85 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2e1a8be14..a78dcad7f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -56,7 +56,7 @@ from .flickr import FlickrIE from .francetv import ( PluzzIE, FranceTvInfoIE, - France2IE, + FranceTVIE, GenerationQuoiIE ) from .freesound import FreesoundIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 66aa3aa0d..290e650e0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -21,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): thumbnail_path = info.find('image').text return {'id': video_id, - 'ext': 'mp4', + 'ext': 'flv' if video_url.startswith('rtmp') else 'mp4', 'url': video_url, 'title': info.find('titre').text, 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), @@ -66,35 +66,100 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id) -class France2IE(FranceTVBaseInfoExtractor): - IE_NAME = u'france2.fr' - _VALID_URL = r'''(?x)https?://www\.france2\.fr/ +class FranceTVIE(FranceTVBaseInfoExtractor): + IE_NAME = u'francetv' + IE_DESC = u'France 2, 3, 4, 5 and Ô' + _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: - emissions/.*?/videos/(?P<id>\d+) - | emission/(?P<key>[^/?]+) + emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) + | (emission|jt)/(?P<key>[^/?]+) )''' - _TEST = { - u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - u'file': u'75540104.mp4', - u'info_dict': { - u'title': u'13h15, le samedi...', - u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + _TESTS = [ + # france2 + { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': 
u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, }, - u'params': { - u'skip_download': True, + # france3 + { + u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/videos/rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'info_dict': { + u'id': u'rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'ext': u'flv', + u'title': u'Pièces à conviction du 04/12/2013', + u'description': u'md5:1cf14ea302ba5f10d992c9eb2bff30dd', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, }, - } + # france4 + { + u'url': u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'info_dict': { + u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'ext': u'flv', + u'title': u'Hero Corp Making of - Extrait 1', + u'description': u'md5:c87d54871b1790679aec1197e73d650a', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, + # france5 + { + u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968', + u'info_dict': { + u'id': u'92837968', + u'ext': u'mp4', + u'title': u'C à dire ?!', + u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + }, + # franceo + { + u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013', + u'info_dict': { + u'id': u'92327925', + u'ext': u'mp4', + u'title': u'Infô-Afrique', + u'description': u'md5:ebf346da789428841bee0fd2a935ea55', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + u'skip': u'The id changes frequently', + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj.group('key'): webpage = self._download_webpage(url, mobj.group('key')) - video_id = self._html_search_regex( - r'''(?x)<div\s+class="video-player">\s* + id_res = [ + (r'''(?x)<div\s+class="video-player">\s* <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+ - class="francetv-video-player">''', - webpage, u'video ID') + class="francetv-video-player">'''), + (r'<a id="player_direct" href="http://info\.francetelevisions' + '\.fr/\?id-video=([^"/&]+)'), + ] + video_id = self._html_search_regex(id_res, webpage, u'video ID') else: video_id = mobj.group('id') return self._extract_video(video_id) From 3514813d5b021c5595b212ba3b1801175840c5c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 21:26:35 +0100 Subject: [PATCH 110/121] [francetv] Add support for urls in the format http://www.france3.fr/emissions/{program}/diffusions/{date} (fixes #1898) --- youtube_dl/extractor/francetv.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 290e650e0..ad85bc16d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -72,7 +72,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) - | (emission|jt)/(?P<key>[^/?]+) + | (emissions?|jt)/(?P<key>[^/?]+) )''' _TESTS = [ @@ -91,12 +91,12 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, # france3 { - u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/videos/rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', u'info_dict': { - u'id': 
u'rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', u'ext': u'flv', - u'title': u'Pièces à conviction du 04/12/2013', - u'description': u'md5:1cf14ea302ba5f10d992c9eb2bff30dd', + u'title': u'Le scandale du prix des médicaments', + u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce', }, u'params': { # rtmp download @@ -158,6 +158,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): class="francetv-video-player">'''), (r'<a id="player_direct" href="http://info\.francetelevisions' '\.fr/\?id-video=([^"/&]+)'), + (r'<a class="video" id="ftv_player_(.+?)"'), ] video_id = self._html_search_regex(id_res, webpage, u'video ID') else: From ef4fd848573b601502ba9142d5ce521294024356 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 6 Dec 2013 09:15:04 +0100 Subject: [PATCH 111/121] [wistia] Add extractor --- youtube_dl/YoutubeDL.py | 3 +- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/generic.py | 31 ++++++++++++------ youtube_dl/extractor/wistia.py | 55 ++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 youtube_dl/extractor/wistia.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8ad7bd1da..07b36a98e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -488,7 +488,8 @@ class YoutubeDL(object): new_result = ie_result.copy() for f in ('_type', 'url', 'ext', 'player_url', 'formats', 'entries', 'urlhandle', 'ie_key', 'duration', - 'subtitles', 'annotations', 'format'): + 'subtitles', 'annotations', 'format', + 'thumbnail', 'thumbnails'): if f in new_result: del new_result[f] if f in embedded_info: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a78dcad7f..a7d37d48b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -178,6 +178,7 @@ from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE +from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 10ae06263..216e03218 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -169,8 +169,13 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex(r'<title>(.*)', - webpage, u'video title', default=u'video', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)(.*?)', webpage, u'video title', + default=u'video') + + # video uploader is domain name + video_uploader = self._search_regex( + r'^(?:https?://)?([^/]*)/.*', url, u'video uploader') # Look for BrightCove: bc_url = BrightcoveIE._extract_brightcove_url(webpage) @@ -188,7 +193,7 @@ class GenericIE(InfoExtractor): # Look for embedded YouTube player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] @@ -197,13 +202,26 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) + 
r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') for tuppl in matches] return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for embedded Wistia player + match = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': unescapeHTML(match.group('url')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': video_id, + } + # Look for Bandcamp pages with custom domain mobj = re.search(r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -247,14 +265,9 @@ class GenericIE(InfoExtractor): # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] - # video uploader is domain name - video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', - url, u'video uploader') - return { 'id': video_id, 'url': video_url, 'uploader': video_uploader, - 'upload_date': None, 'title': video_title, } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py new file mode 100644 index 000000000..e1748c261 --- /dev/null +++ b/youtube_dl/extractor/wistia.py @@ -0,0 +1,55 @@ +import json +import re + +from .common import InfoExtractor + + +class WistiaIE(InfoExtractor): + _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P[a-z0-9]+)' + + _TEST = { + u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt", + u"file": u"sh7fpupwlt.mov", + u"md5": u"cafeb56ec0c53c18c97405eecb3133df", + u"info_dict": { + u"title": u"cfh_resourceful_zdkh_final_1" + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + data_json = self._html_search_regex( + r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data') + + data = json.loads(data_json) + + formats = [] + thumbnails = [] + for atype, a in data['assets'].items(): + if atype == 'still': + thumbnails.append({ + 'url': a['url'], + 'resolution': '%dx%d' % (a['width'], a['height']), + }) + continue + if atype == 'preview': + continue + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'width': a['width'], + 'height': a['height'], + 'filesize': a['size'], + 'ext': a['ext'], + }) + formats.sort(key=lambda a: a['filesize']) + + return { + 'id': video_id, + 'title': data['name'], + 'formats': formats, + 'thumbnails': thumbnails, + } From 4e761794760ff5b281205838bf8a02ea496b89b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:03:08 +0100 Subject: [PATCH 112/121] [vimeo] Extract views count, likes count and comments count (#1895) --- youtube_dl/extractor/vimeo.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f27763ae2..ac956e673 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -196,6 +196,16 @@ class VimeoIE(InfoExtractor): if mobj is not None: video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) + try: + view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count')) + like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count')) + comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count')) + except RegexNotFoundError: + # This info is only 
available in vimeo.com/{id} urls + view_count = None + like_count = None + comment_count = None + # Vimeo specific: extract request signature and timestamp sig = config['request']['signature'] timestamp = config['request']['timestamp'] @@ -242,6 +252,9 @@ class VimeoIE(InfoExtractor): 'description': video_description, 'formats': formats, 'webpage_url': url, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, } From 336c3a69bd198130e2f65f14dfc83383fec7c5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:22:04 +0100 Subject: [PATCH 113/121] [youtube] Extract like and dislike count (#1895) --- youtube_dl/extractor/youtube.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fff761bd..52c8e7d04 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -388,10 +388,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): super(YoutubeIE, self).__init__(*args, **kwargs) self._player_cache = {} - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) @@ -1258,15 +1254,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = self._extract_id(url) # Get video webpage - self.report_video_webpage_download(video_id) url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') + video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1383,6 +1372,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_description = u'' + def _extract_count(klass): + count = self._search_regex(r'class="%s">([\d,]+)' % re.escape(klass), video_webpage, klass, fatal=False) + if count is not None: + return int(count.replace(',', '')) + return None + like_count = _extract_count(u'likes-count') + dislike_count = _extract_count(u'dislikes-count') + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -1515,6 +1512,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'annotations': video_annotations, 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, }) return results From f53c966a73df42a9a949912ef8ab99a64fb99466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:36:36 +0100 Subject: [PATCH 114/121] [dailymotion] Extract view count (#1895) --- test/test_utils.py | 5 +++++ youtube_dl/extractor/dailymotion.py | 5 +++++ youtube_dl/utils.py | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 
e9e590e74..0fa66beec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -26,6 +26,7 @@ from youtube_dl.utils import ( unsmuggle_url, shell_quote, encodeFilename, + str_to_int, ) if sys.version_info < (3, 0): @@ -176,6 +177,10 @@ class TestUtil(unittest.TestCase): args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')] self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""") + def test_str_to_int(self): + self.assertEqual(str_to_int('123,456'), 123456) + self.assertEqual(str_to_int('123.456'), 123456) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 71f5e03ee..3756cf765 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, orderedSet, + str_to_int, ExtractorError, ) @@ -146,6 +147,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return + view_count = str_to_int(self._search_regex( + r'video_views_value[^>]+>([\d\.]+)<', webpage, u'view count')) + return { 'id': video_id, 'formats': formats, @@ -155,6 +159,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, + 'view_count': view_count, } def _get_available_subtitles(self, video_id, webpage): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 77609f7ca..7b5878830 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1020,3 +1020,7 @@ def format_bytes(bytes): suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] converted = float(bytes) / float(1024 ** exponent) return u'%.2f%s' % (converted, suffix) + +def str_to_int(int_str): + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) From 563e405411131628a6ea160c3fe2b2b4a883ac85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:41:07 +0100 Subject: [PATCH 115/121] [dailymotion] Fix view count regex In some languages they can be in the format '123,456' instead of '123.456' --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3756cf765..3bd0b862c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -148,7 +148,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.]+)<', webpage, u'view count')) + r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) return { 'id': video_id, From 7d4afc557f88a05f4f45618c07443aee5aa2099e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 19:48:54 +0100 Subject: [PATCH 116/121] [youtube:playlist] Support mix ids longer than 13 (#1295) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 52c8e7d04..91f8028ff 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1553,7 +1553,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _extract_mix(self, playlist_id): # The mixes are generated from a a single video # the id of the 
playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) + url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') title_span = (get_element_by_attribute('class', 'title long-title', webpage) or get_element_by_attribute('class', 'title ', webpage)) @@ -1581,7 +1581,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - if len(playlist_id) == 13: # 'RD' + 11 characters for the video id + if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) From 715c8e7bdb219f30f83c7d76cbbbc77195366cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 19:52:41 +0100 Subject: [PATCH 117/121] [youtube:playlist] Recognize mix ids for direct use (fixes #1295) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 91f8028ff..01715024c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1528,10 +1528,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): \? (?:.*?&)*? (?:p|a|list)= | p/ ) - ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}) .* | - ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' From 0b6a9f639f6447c7e09c38b88b42964e8fa05349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 20:14:29 +0100 Subject: [PATCH 118/121] [vevo] Update test video's duration --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3eedcf7dd..4823992ef 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -24,7 +24,7 @@ class VevoIE(InfoExtractor): u"upload_date": u"20130624", u"uploader": u"Hurts", u"title": u"Somebody to Die For", - u"duration": 230, + u"duration": 230.12, u"width": 1920, u"height": 1080, } From d349cd22401648e88d57b6dcdd0c8bbb12aaa0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 20:26:55 +0100 Subject: [PATCH 119/121] [imdb] Fix extraction The paths to each format's page may have leading whitespace. The height and the duration can't be extracted. --- youtube_dl/extractor/imdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index d8e9712a7..6fb373db2 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -21,7 +21,6 @@ class ImdbIE(InfoExtractor): u'ext': u'mp4', u'title': u'Ice Age: Continental Drift Trailer (No. 
2) - IMDb', u'description': u'md5:9061c2219254e5d14e03c25c98e96a81', - u'duration': 151, } } @@ -35,6 +34,7 @@ class ImdbIE(InfoExtractor): flags=re.MULTILINE) formats = [] for f_id, f_path in available_formats: + f_path = f_path.strip() format_page = self._download_webpage( compat_urlparse.urljoin(url, f_path), u'Downloading info for %s format' % f_id) @@ -46,7 +46,6 @@ class ImdbIE(InfoExtractor): formats.append({ 'format_id': f_id, 'url': format_info['url'], - 'height': int(info['titleObject']['encoding']['selected'][:-1]), }) return { @@ -55,5 +54,4 @@ class ImdbIE(InfoExtractor): 'formats': formats, 'description': descr, 'thumbnail': format_info['slate'], - 'duration': int(info['titleObject']['title']['duration_seconds']), } From 5cc14c2fd74a721be0effd5bc06a76164a9c97a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 21:47:32 +0100 Subject: [PATCH 120/121] [vimeo] Add an extractor for albums (closes #1911) --- test/test_playlists.py | 9 +++++++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vimeo.py | 23 ++++++++++++++++++++--- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 00c950109..6a5e0b780 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -16,6 +16,7 @@ from youtube_dl.extractor import ( DailymotionUserIE, VimeoChannelIE, VimeoUserIE, + VimeoAlbumIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, @@ -65,6 +66,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Nki') self.assertTrue(len(result['entries']) > 65) + def test_vimeo_album(self): + dl = FakeYDL() + ie = VimeoAlbumIE(dl) + result = ie.extract('http://vimeo.com/album/2632481') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Staff Favorites: November 2013') + self.assertTrue(len(result['entries']) > 12) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a7d37d48b..ac0a11dfe 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -170,6 +170,7 @@ from .vimeo import ( VimeoIE, VimeoChannelIE, VimeoUserIE, + VimeoAlbumIE, ) from .vine import VineIE from .viki import VikiIE diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ac956e673..293dad3c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|(?Pplayer))\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|(?Pplayer))\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ @@ -264,11 +264,14 @@ class VimeoChannelIE(InfoExtractor): _MORE_PAGES_INDICATOR = r']+?title="(.*?)"' + def _page_url(self, base_url, pagenum): + return '%s/videos/page:%d/' % (base_url, pagenum) + def _extract_videos(self, list_id, base_url): video_ids = [] for pagenum in itertools.count(1): webpage = self._download_webpage( - '%s/videos/page:%d/' % (base_url, pagenum),list_id, + self._page_url(base_url, pagenum) ,list_id, u'Downloading page %s' % pagenum) 
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: @@ -297,7 +300,7 @@ class VimeoUserIE(VimeoChannelIE): @classmethod def suitable(cls, url): - if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url): + if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url): return False return super(VimeoUserIE, cls).suitable(url) @@ -305,3 +308,17 @@ class VimeoUserIE(VimeoChannelIE): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') return self._extract_videos(name, 'http://vimeo.com/%s' % name) + + +class VimeoAlbumIE(VimeoChannelIE): + IE_NAME = u'vimeo:album' + _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P\d+)' + _TITLE_RE = r'