From d5a9bb4ea97287e633e891ddd1a416619c9aada9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Sat, 19 Oct 2013 14:04:44 -0300 Subject: [PATCH 001/425] extractor: youtube: Swap video dimensions to match standard practice. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on this, I thought about simplifying things like changing 480x854 to 480p, and that seemed like a good option, until I realized that people (me included) usually link the concept of some number followed by a p with the video being 16:9. So, we would be losing some information and, as we all know, [explicit is better than implicit][*]. [*]: http://www.python.org/dev/peps/pep-0020/ This closes #1446. Signed-off-by: Rogério Brito --- youtube_dl/extractor/youtube.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb7c42830..143fac98a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -253,21 +253,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '248': 'webm', } _video_dimensions = { - '5': '240x400', + '5': '400x240', '6': '???', '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '36': '240x320', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', + '17': '176x144', + '18': '640x360', + '22': '1280x720', + '34': '640x360', + '35': '854x480', + '36': '320x240', + '37': '1920x1080', + '38': '4096x3072', + '43': '640x360', + '44': '854x480', + '45': '1280x720', + '46': '1920x1080', '82': '360p', '83': '480p', '84': '720p', From 5137ebac0b1438d22fe2c007e6172ee65e9311a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Cie=C5=9Blak?= Date: Tue, 5 Nov 2013 23:30:25 +0100 Subject: [PATCH 002/425] [tvp] Telewizja Polska: new extractor 
for tvp.pl, fixes #1719 Thanks-To: mplonski https://github.com/mplonski/linux/blob/master/tvp-dl.py --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tvp.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/tvp.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 888a91cce..78f84cea3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -132,6 +132,7 @@ from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE +from .tvp import TvpIE from .unistra import UnistraIE from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py new file mode 100644 index 000000000..63fb57bbe --- /dev/null +++ b/youtube_dl/extractor/tvp.py @@ -0,0 +1,60 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + RegexNotFoundError, +) + +class TvpIE(InfoExtractor): + IE_NAME = u'tvp.pl' + _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P\d+)/(?P\d+)' + _INFO_URL = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' + + + _TEST = { + u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238', + u'file': u'31.10.2013-12878238.wmv', + u'info_dict': { + u'title': u'31.10.2013', + u'description': u'31.10.2013', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id, "Downloading video webpage") + json_params = self._download_webpage(self._INFO_URL % video_id, video_id, "Downloading video metadata") + + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') + + self.report_extraction(video_id) + try: + video_url = params['video_url'] + except KeyError: + raise 
ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) + + try: + title = self._og_search_title(webpage) + except RegexNotFoundError: + title = video_id + info = { + 'id': video_id, + 'title': title, + 'ext': 'wmv', + 'url': video_url, + } + try: + info['description'] = self._og_search_description(webpage) + info['thumbnail'] = self._og_search_thumbnail(webpage) + except RegexNotFoundError: + pass + + return info From 4894fe8c5baec8b1f21ac6fdebe08175abc7f094 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Tue, 29 Oct 2013 01:05:21 +0100 Subject: [PATCH 003/425] Report download progress of rtmpdump --- youtube_dl/FileDownloader.py | 70 ++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 8ecabab1a..664b78662 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -268,6 +268,61 @@ class FileDownloader(object): (clear_line, data_len_str, self.format_seconds(tot_time))) def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): + def run_rtmpdump(args): + start = time.time() + resume_percent = None + resume_downloaded_data_len = None + proc = subprocess.Popen(args, stderr=subprocess.PIPE) + cursor_in_new_line = True + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = u'' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1))*1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len + eta = self.calc_eta(start, time.time(), 100-resume_percent, 
percent-resume_percent) + speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) + data_len_str = u'~'+self.format_bytes(data_len) + self.report_progress(percent, data_len_str, speed, eta) + cursor_in_new_line = False + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'total_bytes': data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'eta': eta, + 'speed': speed, + }) + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen(u'') + cursor_in_new_line = True + self.to_screen(u'[rtmpdump] '+line) + proc.wait() + if not cursor_in_new_line: + self.to_screen(u'') + return proc.returncode + self.report_destination(filename) tmpfilename = self.temp_name(filename) test = self.params.get('test', False) @@ -278,12 +333,11 @@ class FileDownloader(object): except (OSError, IOError): self.report_error(u'RTMP download detected but "rtmpdump" could not be run') return False - verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet' # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. 
- basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename] + basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: @@ -302,23 +356,25 @@ class FileDownloader(object): except ImportError: shell_quote = repr self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) - retval = subprocess.call(args) + + retval = run_rtmpdump(args) + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) + self.to_screen(u'[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) + retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) cursize = os.path.getsize(encodeFilename(tmpfilename)) if prevsize == cursize and retval == 1: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those if prevsize == cursize and retval == 2 and cursize > 1024: - self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + self.to_screen(u'[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') retval = 0 break if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) + self.to_screen(u'[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, From c8434e83163fc90007eb5b501ea0e827f8b5e127 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 4 Nov 2013 03:08:17 +0100 Subject: [PATCH 004/425] Add support for crunchyroll.com --- youtube_dl/aes.py | 144 ++++++++++++++++++++--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/crunchyroll.py | 171 ++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+), 18 deletions(-) create mode 100644 youtube_dl/extractor/crunchyroll.py diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 9a0c93fa6..e9c5e2152 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -1,4 +1,4 @@ -__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text'] +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] import base64 from math import ceil @@ -32,6 +32,31 @@ def aes_ctr_decrypt(data, key, counter): return decrypted_data +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data=[] + previous_cipher_block = iv + for i in range(block_count): + block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES] + block += [0]*(BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + def 
key_expansion(data): """ Generate key schedule @@ -75,7 +100,7 @@ def aes_encrypt(data, expanded_key): @returns {int[]} 16-Byte cipher """ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) for i in range(1, rounds+1): data = sub_bytes(data) @@ -83,6 +108,26 @@ def aes_encrypt(data, expanded_key): if i != rounds: data = mix_columns(data) data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) + + return data + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + @param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) + if i != rounds: + data = mix_columns_inv(data) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) return data @@ -139,14 +184,69 @@ SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) -MIX_COLUMN_MATRIX = ((2,3,1,1), - (1,2,3,1), - (1,1,2,3), - (3,1,1,2)) +SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) +MIX_COLUMN_MATRIX = ((0x2,0x3,0x1,0x1), + (0x1,0x2,0x3,0x1), + (0x1,0x1,0x2,0x3), + (0x3,0x1,0x1,0x2)) +MIX_COLUMN_MATRIX_INV = ((0xE,0xB,0xD,0x9), + (0x9,0xE,0xB,0xD), + (0xD,0x9,0xE,0xB), + (0xB,0xD,0x9,0xE)) +RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 
0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) +RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 
0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) def sub_bytes(data): return [SBOX[x] for x in data] +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + def rotate(data): return data[1:] + [data[0]] @@ -160,30 +260,31 @@ def key_schedule_core(data, rcon_iteration): def xor(data1, data2): return [x^y for x, y in zip(data1, data2)] -def mix_column(data): +def rijndael_mul(a, b): + if(a==0 or b==0): + return 0 + return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] + +def mix_column(data, matrix): data_mixed = [] for row in range(4): mixed = 0 for column in range(4): - addend = data[column] - if MIX_COLUMN_MATRIX[row][column] in (2,3): - addend <<= 1 - if addend > 0xff: - addend &= 0xff - addend ^= 0x1b - if MIX_COLUMN_MATRIX[row][column] == 3: - addend ^= data[column] - mixed ^= addend & 0xff + # xor is (+) and (-) + mixed ^= rijndael_mul(data[column], matrix[row][column]) data_mixed.append(mixed) return data_mixed -def mix_columns(data): +def mix_columns(data, matrix=MIX_COLUMN_MATRIX): data_mixed = [] for i in range(4): column = data[i*4 : (i+1)*4] - data_mixed += mix_column(column) + data_mixed += mix_column(column, matrix) return data_mixed +def mix_columns_inv(data): + return mix_columns(data, MIX_COLUMN_MATRIX_INV) + def shift_rows(data): data_shifted = [] for column in range(4): @@ -191,6 +292,13 @@ def shift_rows(data): data_shifted.append( data[((column + row) & 0b11) * 4 + row] ) return data_shifted +def shift_rows_inv(data): + data_shifted = [] + for column in range(4): + for row in range(4): + 
data_shifted.append( data[((column - row) & 0b11) * 4 + row] ) + return data_shifted + def inc(data): data = data[:] # copy for i in range(len(data)-1,-1,-1): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bcf1cce7f..a61e17ea1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,6 +23,7 @@ from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE +from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE from .dailymotion import ( DailymotionIE, diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py new file mode 100644 index 000000000..4bd366079 --- /dev/null +++ b/youtube_dl/extractor/crunchyroll.py @@ -0,0 +1,171 @@ +# encoding: utf-8 +import re, base64, zlib +from hashlib import sha1 +from math import pow, sqrt, floor +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + bytes_to_intlist, + intlist_to_bytes, + unified_strdate, + clean_html, +) +from ..aes import ( + aes_cbc_decrypt, + inc, +) + +class CrunchyrollIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?(?Pcrunchyroll\.com/[^/]*/[^/?&]*?(?P[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + u'file': u'645513.flv', + #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', + u'info_dict': { + u'title': u'Wanna be the Strongest in the World – Episode 1 – An Idol-Wrestler is Born!', + u'description': u'md5:2d17137920c64f2f49981a7797d275ef', + u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + u'uploader': u'Yomiuri Telecasting Corporation (YTV)', + u'upload_date': u'20131013', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + }] + + _FORMAT_IDS = { 
+ u'360': (u'60', u'106'), + u'480': (u'61', u'106'), + u'720': (u'62', u'106'), + u'1080': (u'80', u'108'), + } + + def _decrypt_subtitles(self, data, iv, id): + data = bytes_to_intlist(data) + iv = bytes_to_intlist(iv) + id = int(id) + + def obfuscate_key_aux(count, modulo, start): + output = list(start) + for _ in range(count): + output.append(output[-1] + output[-2]) + # cut off start values + output = output[2:] + output = list(map(lambda x: x % modulo + 33, output)) + return output + + def obfuscate_key(key): + num1 = int(floor(pow(2, 25) * sqrt(6.9))) + num2 = (num1 ^ key) << 5 + num3 = key ^ num1 + num4 = num3 ^ (num3 >> 3) ^ num2 + prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) + # Extend 160 Bit hash to 256 Bit + return shaHash + [0] * 12 + + key = obfuscate_key(id) + class Counter: + __value = iv + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) + return zlib.decompress(decrypted_data) + + def _convert_subtitles_to_srt(self, subtitles): + i=1 + output = u'' + for start, end, text in re.findall(r']*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + start = start.replace(u'.', u',') + end = end.replace(u'.', u',') + text = clean_html(text) + text = text.replace(u'\\N', u'\n') + if not text: + continue + output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + i+=1 + return output + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://www.' + mobj.group('url') + video_id = mobj.group(u'video_id') + webpage = self._download_webpage(webpage_url, video_id) + note_m = self._html_search_regex(r'
(.+?)
', webpage, u'trailer-notice', default=u'') + if note_m: + raise ExtractorError(note_m) + + video_title = self._html_search_regex(r']*>(.+?)', webpage, u'video_title', flags=re.DOTALL) + video_title = re.sub(r' {5} *–? *', u' – ', video_title) + video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') + if not video_description: + video_description = None + video_upload_date = self._html_search_regex(r'
Availability for free users:(.+?)
', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) + if video_upload_date: + video_upload_date = unified_strdate(video_upload_date) + video_uploader = self._html_search_regex(r'
\s*Publisher:(.+?)
', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) + + playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) + playerdata_req = compat_urllib_request.Request(playerdata_url) + playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) + playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') + + stream_id = self._search_regex(r'([^<]+)', playerdata, u'stream_id') + video_thumbnail = self._search_regex(r'([^<]+)', playerdata, u'thumbnail', fatal=False) + + formats = [] + for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt+u'p' + streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') + # urlencode doesn't work! + streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format + streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) + streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) + video_url = self._search_regex(r'([^<]+)', streamdata, u'video_url') + video_play_path = self._search_regex(r'([^<]+)', streamdata, u'video_play_path') + formats.append({ + u'url': video_url, + u'play_path': video_play_path, + u'ext': 'flv', + u'format': video_format, + u'format_id': video_format, + }) + + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ + video_id, note=u'Downloading subtitles for '+sub_name) + id = 
self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) + iv = self._search_regex(r'([^<]+)', sub_page, u'subtitle_iv', fatal=False) + data = self._search_regex(r'([^<]+)', sub_page, u'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') + lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) + if not lang_code: + continue + subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + + return { + u'id': video_id, + u'title': video_title, + u'description': video_description, + u'thumbnail': video_thumbnail, + u'uploader': video_uploader, + u'upload_date': video_upload_date, + u'subtitles': subtitles, + u'formats': formats, + } From eb0a83986642cf660820b168bd83c8770e3e5ce6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 12 Nov 2013 10:36:23 +0100 Subject: [PATCH 005/425] [common] Simplify og_search_property --- youtube_dl/extractor/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fb2d50a09..9c20d30b4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -322,9 +322,9 @@ class InfoExtractor(object): if name is None: name = 'OpenGraph %s' % prop escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) - if not escaped is None: - return unescapeHTML(escaped) - return None + if escaped is None: + return None + return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) From 72b18c5d34d7bf02f83d335b886921ff25c501f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 12 Nov 2013 20:38:13 +0100 Subject: [PATCH 
006/425] FFmpegMetadataPP: don't enclose the values with " (fixes #1756) --- youtube_dl/PostProcessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 13b56ede5..69aedf87a 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -501,7 +501,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor): options = ['-c', 'copy'] for (name, value) in metadata.items(): - options.extend(['-metadata', '%s="%s"' % (name, value)]) + options.extend(['-metadata', '%s=%s' % (name, value)]) options.extend(['-f', ext]) self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) From 8b8cbd8f6d3b525dcee3bc3df98c1ad0f093231d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 12 Nov 2013 20:50:52 +0100 Subject: [PATCH 007/425] [vine] Fix uploader extraction --- youtube_dl/extractor/vine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c4ec1f06f..651ba317d 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,7 +27,7 @@ class VineIE(InfoExtractor): video_url = self._html_search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'

(.*?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ From 0bd59f3723a22914a538527921815436c6d2f2d1 Mon Sep 17 00:00:00 2001 From: migbac Date: Tue, 12 Nov 2013 23:32:03 +0100 Subject: [PATCH 008/425] Add support for d8.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/d8.py | 52 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/d8.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9caca4ef..d7db840dd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -25,6 +25,7 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE +from .d8 import D8IE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py new file mode 100644 index 000000000..5ce483b16 --- /dev/null +++ b/youtube_dl/extractor/d8.py @@ -0,0 +1,52 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import unified_strdate + +class D8IE(InfoExtractor): + _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P.*)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s' + IE_NAME = u'd8.tv' + + _TEST = { + u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', + u'file': u'966289.flv', + u'info_dict': { + u'title': u'Campagne intime - Documentaire exceptionnel', + u'description': u'md5:d2643b799fb190846ae09c61e59a859f', + u'upload_date': u'20131108', + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + webpage = self._download_webpage(url, mobj.group('path')) + video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id + info_page = 
self._download_webpage(info_url,video_id, + u'Downloading video info') + + self.report_extraction(video_id) + doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) + video_info = [video for video in doc if video.find('ID').text == video_id][0] + infos = video_info.find('INFOS') + media = video_info.find('MEDIA') + formats = [media.find('VIDEOS/%s' % format) + for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] + video_url = [format.text for format in formats if format is not None][-1] + + return {'id': video_id, + 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'url': video_url, + 'ext': 'flv', + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), + } From eb9b5bffef9d247bba4f2e8d387ddfbc47ece77b Mon Sep 17 00:00:00 2001 From: Jelle van der Waa Date: Wed, 13 Nov 2013 10:30:41 +0100 Subject: [PATCH 009/425] Add extractor for gamekings.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/gamekings.py | 39 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 youtube_dl/extractor/gamekings.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9caca4ef..67a0ad5cb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -57,6 +57,7 @@ from .francetv import ( ) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE +from .gamekings import GamekingsIE from .gamespot import GameSpotIE from .gametrailers import GametrailersIE from .generic import GenericIE diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py new file mode 100644 index 000000000..eca71ab05 --- /dev/null +++ b/youtube_dl/extractor/gamekings.py @@ -0,0 +1,39 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + 
determine_ext, +) + + +class GamekingsIE(InfoExtractor): + _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P[0-9a-z\-])' + _TEST = { + u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", + u'file': u'20130811_PhoenixWright.mp4', + u'md5': u'8d42d15381e2dfa81dee86c7956d35ff', + u'info_dict': { + u"title": u"Phoenix Wright: Ace Attorney – Dual Destinies Review", + u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", + } + } + + def _real_extract(self, url): + + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + gamekings_url = self._og_search_video_url(webpage) + + video = re.search(r'[0-9]+',gamekings_url) + video_id = video.group(0) + + # Todo: add medium format + gamekings_url = gamekings_url.replace(video_id,'large/' + video_id) + + return {'id': video_id, + 'ext': 'mp4', + 'url': gamekings_url, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } From 384b98cd8f90ef2ac25c1f4f20ba9385adabaca8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 13 Nov 2013 10:50:53 +0100 Subject: [PATCH 010/425] [gamekings] Minor fixes (#1759) --- youtube_dl/extractor/gamekings.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index eca71ab05..4b4259447 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -7,13 +7,13 @@ from ..utils import ( class GamekingsIE(InfoExtractor): - _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P[0-9a-z\-])' + _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P[0-9a-z\-]+)' _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", - u'file': u'20130811_PhoenixWright.mp4', - u'md5': 
u'8d42d15381e2dfa81dee86c7956d35ff', + u'file': u'20130811.mp4', + u'md5': u'17f6088f7d0149ff2b46f2714bdb1954', u'info_dict': { - u"title": u"Phoenix Wright: Ace Attorney – Dual Destinies Review", + u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", } } @@ -23,17 +23,18 @@ class GamekingsIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - gamekings_url = self._og_search_video_url(webpage) + video_url = self._og_search_video_url(webpage) - video = re.search(r'[0-9]+',gamekings_url) + video = re.search(r'[0-9]+', video_url) video_id = video.group(0) - # Todo: add medium format - gamekings_url = gamekings_url.replace(video_id,'large/' + video_id) + # Todo: add medium format + video_url = video_url.replace(video_id, 'large/' + video_id) - return {'id': video_id, - 'ext': 'mp4', - 'url': gamekings_url, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - } + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } From b5bdc2699a5fead926114f6db6f1178181d75c58 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 13 Nov 2013 10:52:22 +0100 Subject: [PATCH 011/425] Credit @jelly for gamekings extractor (#1759) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1f1db9f67..254fcd39c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -32,6 +32,7 @@ __authors__ = ( 'Ismael Mejía', 'Steffan \'Ruirize\' James', 'Andras Elso', + 'Jelle van der Waa', ) __license__ = 'Public Domain' From c3a3028f9f7fa79ee7357b65252ff2c9a062bdc8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: 
Wed, 13 Nov 2013 11:06:53 +0100 Subject: [PATCH 012/425] [tvp] Minor improvements (#1730) --- youtube_dl/extractor/tvp.py | 47 +++++++++++-------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 63fb57bbe..32e0f5037 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -1,23 +1,17 @@ -# encoding: utf-8 -import re import json +import re from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - RegexNotFoundError, -) + class TvpIE(InfoExtractor): IE_NAME = u'tvp.pl' _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P\d+)/(?P\d+)' - _INFO_URL = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' - _TEST = { u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238', - u'file': u'31.10.2013-12878238.wmv', + u'md5': u'148408967a6a468953c0a75cbdaf0d7a', + u'file': u'12878238.wmv', u'info_dict': { u'title': u'31.10.2013', u'description': u'31.10.2013', @@ -27,34 +21,21 @@ class TvpIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id, "Downloading video webpage") - json_params = self._download_webpage(self._INFO_URL % video_id, video_id, "Downloading video metadata") - - try: - params = json.loads(json_params) - except: - raise ExtractorError(u'Invalid JSON') + webpage = self._download_webpage(url, video_id) + json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id + json_params = self._download_webpage( + json_url, video_id, u"Downloading video metadata") + params = json.loads(json_params) self.report_extraction(video_id) - try: - video_url = params['video_url'] - except KeyError: - raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) + video_url = params['video_url'] - try: - title = self._og_search_title(webpage) - except RegexNotFoundError: - 
title = video_id - info = { + title = self._og_search_title(webpage, fatal=True) + return { 'id': video_id, 'title': title, 'ext': 'wmv', 'url': video_url, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), } - try: - info['description'] = self._og_search_description(webpage) - info['thumbnail'] = self._og_search_thumbnail(webpage) - except RegexNotFoundError: - pass - - return info From d37936386f0217a411efe96483b97e590b7eceb3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 13 Nov 2013 11:08:07 +0100 Subject: [PATCH 013/425] Credit @saper for tvp IE (#1730) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 254fcd39c..4dee487ab 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -33,6 +33,7 @@ __authors__ = ( 'Steffan \'Ruirize\' James', 'Andras Elso', 'Jelle van der Waa', + 'Marcin Cieślak', ) __license__ = 'Public Domain' From 80b9bbce8687f800b79edb36edf8c193dcf26a78 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 13 Nov 2013 11:09:04 +0100 Subject: [PATCH 014/425] release 2013.11.13 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 338e7ba1f..26b91105f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.11' +__version__ = '2013.11.13' From dcbb45803f9b70041ec0ef9c3c6547340bd1ef7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 16:21:24 +0100 Subject: [PATCH 015/425] [youtube:playlist] Don't use the gdata api (closes #1508) Parse the playlist pages instead --- test/test_youtube_lists.py | 14 +++++------ youtube_dl/extractor/youtube.py | 43 ++++++++++----------------------- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/test/test_youtube_lists.py 
b/test/test_youtube_lists.py index 4b7a7847b..50ad52695 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -27,7 +27,7 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0] + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'ytdl test PL') ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] @@ -44,13 +44,13 @@ class TestYoutubeLists(unittest.TestCase): def test_issue_673(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('PLBB231211A4F62143')[0] + result = ie.extract('PLBB231211A4F62143') self.assertTrue(len(result['entries']) > 25) def test_youtube_playlist_long(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0] + result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') self.assertIsPlaylist(result) self.assertTrue(len(result['entries']) >= 799) @@ -58,7 +58,7 @@ class TestYoutubeLists(unittest.TestCase): #651 dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0] + result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertFalse('pElCt5oNDuI' in ytie_results) self.assertFalse('KdPEApIVdWM' in ytie_results) @@ -66,7 +66,7 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist_empty(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0] + result = 
ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx') self.assertIsPlaylist(result) self.assertEqual(len(result['entries']), 0) @@ -74,7 +74,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) videos course - result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0] + result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') entries = result['entries'] self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) @@ -99,7 +99,7 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_safe_search(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0] + result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl') self.assertEqual(len(result['entries']), 2) def test_youtube_show(self): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c992cba97..d97ea8c83 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1506,8 +1506,9 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' - _MAX_RESULTS = 50 + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _MORE_PAGES_INDICATOR = r'data-link-type="next"' + _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' IE_NAME = u'youtube:playlist' @classmethod @@ -1532,41 +1533,23 @@ class YoutubePlaylistIE(InfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - # Download playlist videos from API - videos = [] + # Extract the video ids from the playlist pages + ids = [] for page_num in itertools.count(1): - start_index = 
self._MAX_RESULTS * (page_num - 1) + 1 - if start_index >= 1000: - self._downloader.report_warning(u'Max number of results reached') - break - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) + url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + # The ids are duplicated + new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + ids.extend(new_ids) - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - - if 'feed' not in response: - raise ExtractorError(u'Got a malformed response from YouTube API') - playlist_title = response['feed']['title']['$t'] - if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS + if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - for entry in response['feed']['entry']: - index = entry['yt$position']['$t'] - if 'media$group' in entry and 'yt$videoid' in entry['media$group']: - videos.append(( - index, - 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] - )) + playlist_title = self._og_search_title(page) - videos = [v[1] for v in sorted(videos)] - - url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] - return [self.playlist_result(url_results, playlist_id, playlist_title)] + url_results = [self.url_result(vid, 'Youtube') for vid in ids] + return self.playlist_result(url_results, playlist_id, playlist_title) class YoutubeChannelIE(InfoExtractor): From 880e1c529de1d0f7f0a065afc4148320894a25b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 16:39:11 +0100 Subject: [PATCH 016/425] [youtube:playlist] Login into youtube if requested (fixes #1757) Allows to download private playlists --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d97ea8c83..c48c0e24f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1490,7 +1490,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): }) return results -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' _VALID_URL = r"""(?: (?:https?://)? @@ -1516,6 +1516,9 @@ class YoutubePlaylistIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def _real_initialize(self): + self._login() + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) From ea7a7af1d46ecb51566db0af3e8779ab2a04b516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 17:13:06 +0100 Subject: [PATCH 017/425] [gamekings] Fix the test video checksum --- youtube_dl/extractor/gamekings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 4b4259447..6aa657ef4 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -11,7 +11,7 @@ class GamekingsIE(InfoExtractor): _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", u'file': u'20130811.mp4', - u'md5': u'17f6088f7d0149ff2b46f2714bdb1954', + u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3', u'info_dict': { u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", From ca715127a2f95da30b6700a5e217a2acc904b459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 17:06:02 +0100 Subject: 
[PATCH 018/425] Don't assume the 'subtitlesformat' is set in the params dict (fixes #1750) --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/extractor/youtube.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5253c39e1..f615911de 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -640,7 +640,7 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat') + sub_format = self.params.get('subtitlesformat', 'srt') for sub_lang in subtitles.keys(): sub = subtitles[sub_lang] if sub is None: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c992cba97..56d30f52a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1098,7 +1098,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat'), + 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'name': l[0].encode('utf-8'), }) url = u'http://www.youtube.com/api/timedtext?' 
+ params @@ -1111,7 +1111,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _get_available_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat') + sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = u'Couldn\'t find automatic captions for %s' % video_id From 9771cceb2cfab153d0c2a5b0a7dbc923de90f2c1 Mon Sep 17 00:00:00 2001 From: Janez Troha Date: Wed, 13 Nov 2013 18:34:03 +0100 Subject: [PATCH 019/425] Fix filename extension leaking to json filename Makes writeinfojson behaving exactly as writethumbnail in case where filename contains mediafile extension. Case: video.mp4 converted to music.mp3 would yield music.mp4.info.json instead music.mp3.info.json or music.info.json --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f615911de..3160d9712 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -655,7 +655,7 @@ class YoutubeDL(object): return if self.params.get('writeinfojson', False): - infofn = filename + u'.info.json' + infofn = os.path.splitext(filename)[0] + u'.info.json' self.report_writeinfojson(infofn) try: json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) From 08bc37cdd02bc861a0b5271abb34623f0ebcd66c Mon Sep 17 00:00:00 2001 From: Janez Troha Date: Wed, 13 Nov 2013 18:55:49 +0100 Subject: [PATCH 020/425] Update test_write_info_json.py --- test/test_write_info_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index a5b6f6972..30c4859fd 100644 --- a/test/test_write_info_json.py +++ 
b/test/test_write_info_json.py @@ -31,7 +31,7 @@ params = get_params({ TEST_ID = 'BaW_jenozKc' -INFO_JSON_FILE = TEST_ID + '.mp4.info.json' +INFO_JSON_FILE = TEST_ID + '.info.json' DESCRIPTION_FILE = TEST_ID + '.mp4.description' EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐 From c66d2baa9cb33327a3318f49cbb89f9ac559c978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 14 Nov 2013 13:16:32 +0100 Subject: [PATCH 021/425] [livestream] Add an extractor for the original version of livestream (closes #1764) The two versions use different systems. --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/livestream.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0594a3666..ee3173468 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -80,7 +80,7 @@ from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE -from .livestream import LivestreamIE +from .livestream import LivestreamIE, LivestreamOriginalIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 4531fd6ab..1a3e0ae6b 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,16 +1,19 @@ import re import json +import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, compat_urlparse, get_meta_content, + xpath_with_ns, ExtractorError, ) class LivestreamIE(InfoExtractor): + IE_NAME = u'livestream' _VALID_URL = r'http://new.livestream.com/.*?/(?P.*?)(/videos/(?P\d+))?/?$' _TEST = { u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', @@ -54,3 +57,44 @@ class 
LivestreamIE(InfoExtractor): info = json.loads(self._download_webpage(api_url, video_id, u'Downloading video info')) return self._extract_video_info(info) + + +# The original version of Livestream uses a different system +class LivestreamOriginalIE(InfoExtractor): + IE_NAME = u'livestream:original' + _VALID_URL = r'https?://www\.livestream\.com/(?P[^/]+)/video\?.*?clipId=(?P.*?)(&|$)' + _TEST = { + u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + u'info_dict': { + u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + u'ext': u'flv', + u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + user = mobj.group('user') + api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) + + api_response = self._download_webpage(api_url, video_id) + info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8')) + item = info.find('channel').find('item') + ns = {'media': 'http://search.yahoo.com/mrss'} + thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] + # Remove the extension and number from the path (like 1.jpg) + path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path') + + return { + 'id': video_id, + 'title': item.find('title').text, + 'url': 'rtmp://extondemand.livestream.com/ondemand', + 'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path), + 'ext': 'flv', + 'thumbnail': thumbnail_url, + } From e3b9ab5e187a590143f7f6110e55d43fb78f15bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 14 Nov 2013 19:45:39 +0100 Subject: [PATCH 022/425] [soundlcoud] Set the correct extension for the tracks (fixes #1766) Some tracks are not in mp3 format, they can be 
wav files. --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 4717fbb77..83e1f055f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -87,7 +87,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': info['user']['username'], 'upload_date': unified_strdate(info['created_at']), 'title': info['title'], - 'ext': u'mp3', + 'ext': info.get('original_format', u'mp3'), 'description': info['description'], 'thumbnail': thumbnail, } From 9f9be844fcc5155ab3e832c8428c8f016bea819b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 Nov 2013 01:45:34 +0100 Subject: [PATCH 023/425] [youtube] Fix protocol-independent URLs (Fixes #1768) --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c992cba97..ed82e4fc0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Turn the encrypted s field into a working signature""" if player_url is not None: + if player_url.startswith(u'//'): + player_url = u'https:' + player_url try: player_id = (player_url, len(s)) if player_id not in self._player_cache: From a25a5cfeecc1c6371ab28d6d458cd066baf7013d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 Nov 2013 01:47:15 +0100 Subject: [PATCH 024/425] release 2013.11.15 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 26b91105f..cd9f0b546 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.13' +__version__ = '2013.11.15' From feee2ecfa9fbc6fd34246c7e167ac9542ae7def2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 11:04:26 +0100 Subject: [PATCH 025/425] Pass the 'download' argument to 'process_video_result' (fixes #1769) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f615911de..b5c670dd4 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -385,7 +385,7 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system if result_type == 'video': self.add_extra_info(ie_result, extra_info) - return self.process_video_result(ie_result) + return self.process_video_result(ie_result, download=download) elif result_type == 'url': # We have to add extra_info to the results because it may be # contained in a playlist From b9643eed7c9b081c91e72257b380ccbd92555254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 11:51:45 +0100 Subject: [PATCH 026/425] [youtube:channel] Fix the extraction of autogenerated channels The ajax pages are empty, now it looks directly in the channel's /videos page --- youtube_dl/extractor/youtube.py | 37 +++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7d7aeb461..8c0e6f252 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1594,20 +1594,31 @@ class YoutubeChannelIE(InfoExtractor): # Download channel page channel_id = mobj.group(1) video_ids = [] + url = 'https://www.youtube.com/channel/%s/videos' % channel_id + channel_page = self._download_webpage(url, channel_id) + if re.search(r'channel-header-autogenerated-label', channel_page) is not None: + autogenerated = True + else: + autogenerated = False - # Download all channel pages using the json-based channel_ajax query - for 
pagenum in itertools.count(1): - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - page = json.loads(page) - - ids_in_page = self.extract_videos_from_page(page['content_html']) - video_ids.extend(ids_in_page) - - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: - break + if autogenerated: + # The videos are contained in a single page + # the ajax pages can't be used, they are empty + video_ids = self.extract_videos_from_page(channel_page) + else: + # Download all channel pages using the json-based channel_ajax query + for pagenum in itertools.count(1): + url = self._MORE_PAGES_URL % (pagenum, channel_id) + page = self._download_webpage(url, channel_id, + u'Downloading page #%s' % pagenum) + + page = json.loads(page) + + ids_in_page = self.extract_videos_from_page(page['content_html']) + video_ids.extend(ids_in_page) + + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + break self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) From 85d61685f15bdc62709c699e849af512db78089f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:10:22 +0100 Subject: [PATCH 027/425] [tvp] Update the title and the description of the test video --- youtube_dl/extractor/tvp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 32e0f5037..76721a986 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -13,8 +13,8 @@ class TvpIE(InfoExtractor): u'md5': u'148408967a6a468953c0a75cbdaf0d7a', u'file': u'12878238.wmv', u'info_dict': { - u'title': u'31.10.2013', - u'description': u'31.10.2013', + u'title': u'31.10.2013 - Odcinek 2', + u'description': u'31.10.2013 - Odcinek 2', }, } From ab2d524780736249c8988313db021e83642c24d1 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:24:54 +0100 Subject: [PATCH 028/425] Improve the OpenGraph regex * Do not accept '>' between the property and content attributes. * Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE). --- youtube_dl/extractor/common.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c20d30b4..e02176852 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -315,13 +315,17 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod - def _og_regex(prop): - return r']+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, + r']+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + ] def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) + escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -336,8 +340,8 @@ class InfoExtractor(object): return self._og_search_property('title', html, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = [self._og_regex('video')] - if secure: regexes.insert(0, self._og_regex('video:secure_url')) + regexes = self._og_regexes('video') + if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) def _rta_search(self, html): From 78fb87b2837e15124b5855734a951598dfe025fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:54:13 +0100 Subject: [PATCH 
029/425] Don't accept '>' inside the content attribute in OpenGraph regexes --- youtube_dl/extractor/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e02176852..45dd01789 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -316,10 +316,12 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - esc_prop = re.escape(prop) + content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + template = r']+?%s[^>]+?%s' return [ - r']+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, - r']+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, + template % (property_re, content_re), + template % (content_re, property_re), ] def _og_search_property(self, prop, html, name=None, **kargs): From d24ffe1cfadca8fdc37a7b2fb5c2f080c785ad1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 Nov 2013 12:57:59 +0100 Subject: [PATCH 030/425] [rtlnow] Remove the test for nitro The videos expire. 
--- youtube_dl/extractor/rtlnow.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 9ac7c3be8..2f238de35 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -62,18 +62,6 @@ class RTLnowIE(InfoExtractor): u'skip_download': True, }, }, - { - u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1', - u'file': u'129679.flv', - u'info_dict': { - u'upload_date': u'20131016', - u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...', - u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig', - }, - u'params': { - u'skip_download': True, - }, - }, { u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10', u'file': u'124903.flv', From 463a908705674b9411a01b7a696c84348ec0244e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 Nov 2013 14:06:38 +0100 Subject: [PATCH 031/425] [ted] simplify --- youtube_dl/extractor/ted.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 76cfdfb90..8001ca5a3 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -85,7 +85,7 @@ class TEDIE(SubtitlesInfoExtractor): 'ext': 'mp4', 'url': stream['file'], 'format': stream['id'] - } for stream in info['htmlStreams']] + } for stream in info['htmlStreams']] video_id = info['id'] @@ -95,7 +95,7 @@ class TEDIE(SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - info = { + return { 'id': video_id, 'title': title, 'thumbnail': thumbnail, @@ -104,11 +104,6 @@ class TEDIE(SubtitlesInfoExtractor): 'formats': formats, } - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - return info - def _get_available_subtitles(self, video_id, webpage): try: 
options = self._search_regex(r'(?:)', webpage, 'subtitles_language_select', flags=re.DOTALL) From fc2ef392bee9e46564d4106d3b0e5f6b8d71e37b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 Nov 2013 14:33:51 +0100 Subject: [PATCH 032/425] [ted] Fix playlists (Fixes #1770) --- youtube_dl/extractor/ted.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 8001ca5a3..2e497c86e 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -43,26 +43,25 @@ class TEDIE(SubtitlesInfoExtractor): self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] - def _playlist_videos_info(self,url,name,playlist_id=0): + + def _playlist_videos_info(self, url, name, playlist_id): '''Returns the videos of the playlist''' - video_RE=r''' - (?P.+?)

' - webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') - m_videos=re.finditer(video_RE,webpage,re.VERBOSE) - m_names=re.finditer(video_name_RE,webpage) + + webpage = self._download_webpage( + url, playlist_id, u'Downloading playlist webpage') + matches = re.finditer( + r'/talks/[^"]+\.html)">[^<]*

', + webpage) playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', webpage, 'playlist title') - playlist_entries = [] - for m_video, m_name in zip(m_videos,m_names): - talk_url='http://www.ted.com%s' % m_name.group('talk_url') - playlist_entries.append(self.url_result(talk_url, 'TED')) - return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) + playlist_entries = [ + self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED') + for m in matches + ] + return self.playlist_result( + playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title) def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" From aa13b2dffde73978fe2a169d3f99de7e5e7754cb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 Nov 2013 14:35:00 +0100 Subject: [PATCH 033/425] release 2013.11.15.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cd9f0b546..b04238eb5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.15' +__version__ = '2013.11.15.1' From 91c7271aabdd74c833ef570db59018e2d9f9d803 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:08:43 +0100 Subject: [PATCH 034/425] Add automatic generation of format note based on bitrate and codecs --- youtube_dl/YoutubeDL.py | 18 +++++++++++++++++- youtube_dl/extractor/common.py | 4 ++++ youtube_dl/extractor/vevo.py | 7 ++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b5c670dd4..9c79af1f2 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -781,12 +781,28 @@ class YoutubeDL(object): return res def list_formats(self, info_dict): + def format_note(fdict): + if fdict.get('format_note') is not None: + return fdict['format_note'] + res = u'' + if fdict.get('vcodec') is not None: + res += fdict['vcodec'] + if fdict.get('vbr') is not 
None: + res += u'@%4dk' % fdict['vbr'] + if fdict.get('acodec') is not None: + if res: + res += u', ' + res += fdict['acodec'] + if fdict.get('abr') is not None: + res += u'@%3dk' % fdict['abr'] + return res + def line(format): return (u'%-20s%-10s%-12s%s' % ( format['format_id'], format['ext'], self.format_resolution(format), - format.get('format_note', ''), + format_note(format), ) ) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 45dd01789..f787d0a3c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -71,6 +71,10 @@ class InfoExtractor(object): ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * vbr Average video bitrate in KBit/s + * vcodec Name of the video codec in use webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3f6020f74..4378b1780 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -78,12 +78,13 @@ class VevoIE(InfoExtractor): continue format_url = self._SMIL_BASE_URL + m.group('path') - format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' % - m.groupdict()) formats.append({ 'url': format_url, 'format_id': u'SMIL_' + m.group('cbr'), - 'format_note': format_note, + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), 'ext': m.group('ext'), 'width': int(m.group('width')), 'height': int(m.group('height')), From 7150858d49f05a5650a12b5f4694f91dfb9595d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:33:12 +0100 Subject: [PATCH 035/425] [spiegel] Implement format selection --- youtube_dl/YoutubeDL.py | 10 +++++-- youtube_dl/extractor/spiegel.py | 
50 ++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9c79af1f2..273f7d977 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -786,13 +786,19 @@ class YoutubeDL(object): return fdict['format_note'] res = u'' if fdict.get('vcodec') is not None: - res += fdict['vcodec'] + res += u'%-5s' % fdict['vcodec'] + elif fdict.get('vbr') is not None: + res += u'video' if fdict.get('vbr') is not None: res += u'@%4dk' % fdict['vbr'] if fdict.get('acodec') is not None: if res: res += u', ' - res += fdict['acodec'] + res += u'%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += u', ' + res += 'audio' if fdict.get('abr') is not None: res += u'@%3dk' % fdict['abr'] return res diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 13c86401c..6dc2eda6d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -2,18 +2,27 @@ import re import xml.etree.ElementTree from .common import InfoExtractor +from ..utils import determine_ext class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:\.html)?(?:#.*)?$' - _TEST = { + _TESTS = [{ u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', u'file': u'1259285.mp4', u'md5': u'2c2754212136f35fb4b19767d242f66e', u'info_dict': { u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv" } - } + }, + { + u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', + u'file': u'1309159.mp4', + u'md5': u'f2cdf638d7aa47654e251e1aee360af1', + u'info_dict': { + u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers' + } + }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) @@ -21,25 +30,38 @@ class SpiegelIE(InfoExtractor): webpage = 
self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'
(.*?)
', - webpage, u'title') + video_title = self._html_search_regex( + r'
(.*?)
', webpage, u'title') xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' - xml_code = self._download_webpage(xml_url, video_id, - note=u'Downloading XML', errnote=u'Failed to download XML') + xml_code = self._download_webpage( + xml_url, video_id, + note=u'Downloading XML', errnote=u'Failed to download XML') idoc = xml.etree.ElementTree.fromstring(xml_code) - last_type = idoc[-1] - filename = last_type.findall('./filename')[0].text - duration = float(last_type.findall('./duration')[0].text) - video_url = 'http://video2.spiegel.de/flash/' + filename - video_ext = filename.rpartition('.')[2] + formats = [ + { + 'format_id': n.tag.rpartition('type')[2], + 'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text, + 'width': int(n.find('./width').text), + 'height': int(n.find('./height').text), + 'abr': int(n.find('./audiobitrate').text), + 'vbr': int(n.find('./videobitrate').text), + 'vcodec': n.find('./codec').text, + 'acodec': 'MP4A', + } + for n in list(idoc) + # Blacklist type 6, it's extremely LQ and not available on the same server + if n.tag.startswith('type') and n.tag != 'type6' + ] + formats.sort(key=lambda f: f['vbr']) + duration = float(idoc[0].findall('./duration')[0].text) + info = { 'id': video_id, - 'url': video_url, - 'ext': video_ext, 'title': video_title, 'duration': duration, + 'formats': formats, } - return [info] + return info From b5349e8721d0580a50519593926726a2ea832c9b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:39:45 +0100 Subject: [PATCH 036/425] Fix indentation of (best) and (worst) in --list-formats --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 273f7d977..d29e8bec5 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -815,8 +815,8 @@ class YoutubeDL(object): formats = info_dict.get('formats', [info_dict]) formats_s = list(map(line, formats)) if len(formats) 
> 1: - formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)' - formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)' + formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' + formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', From f058e340114cb599e21001cef25ab3d02cfb194f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:56:23 +0100 Subject: [PATCH 037/425] [dailymotion] Fix playlists --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index e87690f9d..71f5e03ee 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -186,7 +186,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): webpage = self._download_webpage(request, id, u'Downloading page %s' % pagenum) - playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) + playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage) video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: From ce152341a118c94e442f42db8008de95aebec56b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 01:59:28 +0100 Subject: [PATCH 038/425] [bambuser] Do not test for MD5, seems to be flaky --- youtube_dl/extractor/bambuser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index f3b36f473..967568c4a 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -15,7 +15,8 @@ class BambuserIE(InfoExtractor): _TEST = { u'url': u'http://bambuser.com/v/4050584', - u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', + # MD5 seems to be flaky, see 
https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388 + #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', u'info_dict': { u'id': u'4050584', u'ext': u'flv', From 52d703d3d1b0d24f6c4c01ef8fcf33bfee78928b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 02:09:30 +0100 Subject: [PATCH 039/425] [tvp] Skip tests --- youtube_dl/extractor/tvp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 76721a986..bfed9dd04 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -16,6 +16,7 @@ class TvpIE(InfoExtractor): u'title': u'31.10.2013 - Odcinek 2', u'description': u'31.10.2013 - Odcinek 2', }, + u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.' } def _real_extract(self, url): From 21ea3e06c9e39ac2cbfa24e7c02ebac936202893 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 02:31:02 +0100 Subject: [PATCH 040/425] [gamekings] remove unnecessary import --- youtube_dl/extractor/gamekings.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 6aa657ef4..64715581d 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -1,9 +1,6 @@ import re from .common import InfoExtractor -from ..utils import ( - determine_ext, -) class GamekingsIE(InfoExtractor): From eab2724138670159f1946608359f5e5bd5a7e7af Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 02:32:08 +0100 Subject: [PATCH 041/425] [gamekings] Do not test md5 sum, precise file changes regularly --- youtube_dl/extractor/gamekings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 64715581d..c91669b0e 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py 
@@ -8,7 +8,8 @@ class GamekingsIE(InfoExtractor): _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", u'file': u'20130811.mp4', - u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3', + # MD5 is flaky, seems to change regularly + #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3', u'info_dict': { u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", From d1c252048bf74add82de0b9812b589956f3c8a69 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 16 Nov 2013 10:30:09 +0100 Subject: [PATCH 042/425] [redtube] Do not test md5, seems to vary --- youtube_dl/extractor/redtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 994778e16..3bbda128e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -8,7 +8,9 @@ class RedTubeIE(InfoExtractor): _TEST = { u'url': u'http://www.redtube.com/66418', u'file': u'66418.mp4', - u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', + # md5 varies from time to time, as in + # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295 + #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', u'info_dict': { u"title": u"Sucked on a toilet", u"age_limit": 18, From ba3881dffd241d8719430c31f107156ed8830996 Mon Sep 17 00:00:00 2001 From: Anton Larionov Date: Sat, 16 Nov 2013 18:26:34 +0400 Subject: [PATCH 043/425] Add support for anitube.se (#1417) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/anitube.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/anitube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ee3173468..a30de3033 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py 
@@ -1,5 +1,6 @@ from .appletrailers import AppleTrailersIE from .addanime import AddAnimeIE +from .anitube import AnitubeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ( diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py new file mode 100644 index 000000000..2954966a6 --- /dev/null +++ b/youtube_dl/extractor/anitube.py @@ -0,0 +1,59 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class AnitubeIE(InfoExtractor): + IE_NAME = u'anitube.se' + _VALID_URL = r'http?://(?:www\.)?anitube\.se/video/(?P\d+)' + + _TEST = { + u'url': u'http://www.anitube.se/video/36621', + u'md5': u'0c4e4f1051bf50f5982f829f7230f539', + u'info_dict': { + u'id': u'36621', + u'ext': u'mp4', + u'title': u'Recorder to Randoseru 01', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', + webpage, u'key') + + webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, + key) + + config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8')) + + video_title = config_xml.find('title').text + + + formats = [] + + video_url = config_xml.find('file') + if video_url is not None: + formats.append({ + 'format_id': 'sd', + 'url': video_url.text, + }) + + video_url = config_xml.find('filehd') + if video_url is not None: + formats.append({ + 'format_id': 'hd', + 'url': video_url.text, + }) + + return { + 'id': video_id, + 'title': video_title, + 'ext': 'mp4', + 'formats': formats + } From ddf49c63445d236138846d778b18db6ede067fc8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 11:05:49 +0100 Subject: [PATCH 044/425] [arte] remove two typos --- youtube_dl/extractor/arte.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b35a679e3..44d0b5d70 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -69,7 +69,7 @@ class ArteTvIE(InfoExtractor): lang = mobj.group('lang') return self._extract_liveweb(url, name, lang) - if re.search(self._LIVE_URL, video_id) is not None: + if re.search(self._LIVE_URL, url) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) # return @@ -115,7 +115,7 @@ class ArteTvIE(InfoExtractor): event_doc = config_doc.find('event') url_node = event_doc.find('video').find('urlHd') if url_node is None: - url_node = video_doc.find('urlSd') + url_node = event_doc.find('urlSd') return {'id': video_id, 'title': event_doc.find('name%s' % lang.capitalize()).text, From 1d699755e0978f2ec0932ef8d6562394a4799871 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 11:06:16 +0100 Subject: [PATCH 045/425] [youtube] Add view_count (Fixes #1781) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8c0e6f252..1aa549740 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1301,6 +1301,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'"token" parameter not in video info for unknown reason') + if 'view_count' in video_info: + view_count = int(video_info['view_count'][0]) + else: + view_count = None + # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError(u'"rental" videos not supported') @@ -1489,6 +1494,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'view_count': view_count, }) return results 
From 1e5b9a95fd2049e024b3ee2f13b4da5c308d2e9c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 11:39:52 +0100 Subject: [PATCH 046/425] Move console_title to YoutubeDL --- youtube_dl/FileDownloader.py | 17 +++-------------- youtube_dl/YoutubeDL.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 088f59586..e5a542ed5 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -5,9 +5,6 @@ import subprocess import sys import time -if os.name == 'nt': - import ctypes - from .utils import ( compat_urllib_error, compat_urllib_request, @@ -151,16 +148,8 @@ class FileDownloader(object): def to_stderr(self, message): self.ydl.to_screen(message) - def to_cons_title(self, message): - """Set console/terminal window title to message.""" - if not self.params.get('consoletitle', False): - return - if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - elif 'TERM' in os.environ: - self.to_screen('\033]0;%s\007' % message, skip_eol=True) + def to_console_title(self, message): + self.ydl.to_console_title(message) def trouble(self, *args, **kargs): self.ydl.trouble(*args, **kargs) @@ -249,7 +238,7 @@ class FileDownloader(object): else: self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' % (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True) - self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % + self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' % (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) def report_resuming_byte(self, resume_len): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d29e8bec5..6ea865bd9 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -13,6 
+13,9 @@ import sys import time import traceback +if os.name == 'nt': + import ctypes + from .utils import * from .extractor import get_info_extractor, gen_extractors from .FileDownloader import FileDownloader @@ -176,6 +179,16 @@ class YoutubeDL(object): output = output.encode(preferredencoding()) sys.stderr.write(output) + def to_console_title(self, message): + if not self.params.get('consoletitle', False): + return + if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + elif 'TERM' in os.environ: + self.to_screen('\033]0;%s\007' % message, skip_eol=True) + def fixed_template(self): """Checks if the output template is fixed.""" return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) From ce02ed60f27ea27e66c33af745dc7e716377b46f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 16:47:52 +0100 Subject: [PATCH 047/425] Remove * imports --- youtube_dl/YoutubeDL.py | 28 ++++++++++++++++++++++++++-- youtube_dl/update.py | 6 +++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6ea865bd9..6e5ae44d3 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -16,7 +16,31 @@ import traceback if os.name == 'nt': import ctypes -from .utils import * +from .utils import ( + compat_http_client, + compat_print, + compat_str, + compat_urllib_error, + compat_urllib_request, + ContentTooShortError, + date_from_str, + DateRange, + determine_ext, + DownloadError, + encodeFilename, + ExtractorError, + locked_file, + MaxDownloadsReached, + PostProcessingError, + preferredencoding, + SameFileError, + sanitize_filename, + subtitles_filename, + takewhile_inclusive, + UnavailableVideoError, + write_json_file, + write_string, +) from .extractor import get_info_extractor, gen_extractors from .FileDownloader 
import FileDownloader @@ -267,7 +291,7 @@ class YoutubeDL(object): """Report file has already been fully downloaded.""" try: self.to_screen(u'[download] %s has already been downloaded' % file_name) - except (UnicodeEncodeError) as err: + except UnicodeEncodeError: self.to_screen(u'[download] The file has already been downloaded') def increment_downloads(self): diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 0689a4891..f41b4785a 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -2,11 +2,15 @@ import io import json import traceback import hashlib +import os import subprocess import sys from zipimport import zipimporter -from .utils import * +from .utils import ( + compat_str, + compat_urllib_request, +) from .version import __version__ def rsa_verify(message, signature, key): From 90b6bbc38c7258358e294e7f1bcb3d46cd56ffd9 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sun, 17 Nov 2013 17:42:24 +0100 Subject: [PATCH 048/425] [SouthParkStudiosIE] Also detect urls without http:// or www --- youtube_dl/extractor/southparkstudios.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index b1e96b679..bb0c4b393 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -5,7 +5,7 @@ from .mtv import MTVIE, _media_xml_tag class SouthParkStudiosIE(MTVIE): IE_NAME = u'southparkstudios.com' - _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$)' + _VALID_URL = r'(https?://)?(www\.)?(?Psouthparkstudios\.com/(clips|full-episodes)/(?P.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -31,6 +31,7 @@ class SouthParkStudiosIE(MTVIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + url = u'http://www.' 
+ mobj.group(u'url') video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', From 1672647ade97e070fa67eeff68370910ae715573 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sun, 17 Nov 2013 17:43:58 +0100 Subject: [PATCH 049/425] [SouthParkStudiosIE] Move from _TEST to _TESTS --- youtube_dl/extractor/southparkstudios.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index bb0c4b393..a75e328a7 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -9,17 +9,15 @@ class SouthParkStudiosIE(MTVIE): _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' - _TEST = { + # Overwrite MTVIE properties we don't want + _TESTS = [{ u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', u'info_dict': { u'title': u'Bat Daded', u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.', }, - } - - # Overwrite MTVIE properties we don't want - _TESTS = [] + }] def _get_thumbnail_url(self, uri, itemdoc): search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) From 746f491f82cfddd6fafacbf9978205963d7a214d Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sun, 17 Nov 2013 17:54:47 +0100 Subject: [PATCH 050/425] Add support for southpark.de --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/southparkstudios.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ee3173468..2d1e3cdfd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -116,7 +116,10 @@ from .slashdot import SlashdotIE from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud 
import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE -from .southparkstudios import SouthParkStudiosIE +from .southparkstudios import ( + SouthParkStudiosIE, + SouthparkDeIE, +) from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index a75e328a7..a711531e6 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -35,3 +35,17 @@ class SouthParkStudiosIE(MTVIE): mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', webpage, u'mgid') return self._get_videos_info(mgid) + +class SouthparkDeIE(SouthParkStudiosIE): + IE_NAME = u'southpark.de' + _VALID_URL = r'(https?://)?(www\.)?(?Psouthpark\.de/(clips|alle-episoden)/(?P.+?)(\?|#|$))' + _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' + + _TESTS = [{ + u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', + u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4', + u'info_dict': { + u'title': u'The Government Won\'t Respect My Privacy', + u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + }, + }] From bdde425cbe01329d8c24e18cf0492465abb21411 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 21:05:14 +0100 Subject: [PATCH 051/425] Save and restore console title (Fixes #1782) --- youtube_dl/YoutubeDL.py | 19 ++++++++ youtube_dl/__init__.py | 97 +++++++++++++++++++++-------------------- 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6e5ae44d3..4e28f9120 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -213,6 +213,25 @@ class YoutubeDL(object): elif 'TERM' in os.environ: self.to_screen('\033]0;%s\007' % message, skip_eol=True) + def save_console_title(self): + if not self.params.get('consoletitle', False): + return + if 
'TERM' in os.environ: + self.to_screen('\033[22t') + + def restore_console_title(self): + if not self.params.get('consoletitle', False): + return + if 'TERM' in os.environ: + self.to_screen('\033[23t') + + def __enter__(self): + self.save_console_title() + return self + + def __exit__(self, *args): + self.restore_console_title() + def fixed_template(self): """Checks if the output template is fixed.""" return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4dee487ab..af4c9c5c4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -603,8 +603,7 @@ def _real_main(argv=None): u' file! Use "%%(ext)s" instead of %r' % determine_ext(outtmpl, u'')) - # YoutubeDL - ydl = YoutubeDL({ + ydl_opts = { 'usenetrc': opts.usenetrc, 'username': opts.username, 'password': opts.password, @@ -667,61 +666,63 @@ def _real_main(argv=None): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': opts.download_archive, - }) + } - if opts.verbose: - write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_string(u'[debug] Git HEAD: ' + out + u'\n') - except: + with YoutubeDL(ydl_opts) as ydl: + if opts.verbose: + write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') try: - sys.exc_clear() + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_string(u'[debug] Git HEAD: ' + out + u'\n') except: - pass - write_string(u'[debug] Python version %s - %s' 
%(platform.python_version(), platform_name()) + u'\n') + try: + sys.exc_clear() + except: + pass + write_string(u'[debug] Python version %s - %s' % + (platform.python_version(), platform_name()) + u'\n') - proxy_map = {} - for handler in opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') + proxy_map = {} + for handler in opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - ydl.add_default_info_extractors() + ydl.add_default_info_extractors() - # PostProcessors - # Add the metadata pp first, the other pps will copy it - if opts.addmetadata: - ydl.add_post_processor(FFmpegMetadataPP()) - if opts.extractaudio: - ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) - if opts.recodevideo: - ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) - if opts.embedsubtitles: - ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) + # PostProcessors + # Add the metadata pp first, the other pps will copy it + if opts.addmetadata: + ydl.add_post_processor(FFmpegMetadataPP()) + if opts.extractaudio: + ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) + if opts.recodevideo: + ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) + if opts.embedsubtitles: + ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) - # Update version - if opts.update_self: - update_self(ydl.to_screen, opts.verbose) + # Update version + if opts.update_self: + update_self(ydl.to_screen, opts.verbose) - # Maybe do nothing - if len(all_urls) < 1: - if not opts.update_self: - 
parser.error(u'you must provide at least one URL') - else: - sys.exit() + # Maybe do nothing + if len(all_urls) < 1: + if not opts.update_self: + parser.error(u'you must provide at least one URL') + else: + sys.exit() - try: - retcode = ydl.download(all_urls) - except MaxDownloadsReached: - ydl.to_screen(u'--max-download limit reached, aborting.') - retcode = 101 + try: + retcode = ydl.download(all_urls) + except MaxDownloadsReached: + ydl.to_screen(u'--max-download limit reached, aborting.') + retcode = 101 # Dump cookie jar if requested if opts.cookiefile is not None: From 749febf4d1513328ca8b9b6c16969d8c70ff4555 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 21:12:50 +0100 Subject: [PATCH 052/425] Allow --console-title when --quiet is given (Fixes #1783) --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4e28f9120..20eed96ca 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -211,19 +211,19 @@ class YoutubeDL(object): # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: - self.to_screen('\033]0;%s\007' % message, skip_eol=True) + write_string(u'\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: - self.to_screen('\033[22t') + write_string(u'\033[22t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: - self.to_screen('\033[23t') + write_string(u'\033[23t', self._screen_file) def __enter__(self): self.save_console_title() From 63b7b7224a9cbb33856d3e1f8755ba4bdd3a9d58 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 22:11:39 +0100 Subject: [PATCH 053/425] [MTVIE] Try with RTMP URL if download fails This fixes youtube-dl 
http://www.southpark.de/clips/155251/cartman-vs-the-dog-whisperer --- youtube_dl/extractor/mtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 24a79ae13..3df7f9b85 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -48,7 +48,7 @@ class MTVIE(InfoExtractor): def _transform_rtmp_url(rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?Pgsp\..+?/.*)$', rtmp_video_url) if not m: - raise ExtractorError(u'Cannot transform RTMP url') + return rtmp_video_url base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' return base + m.group('finalid') From 73c566695fac926e7e9e6922fe4e6d82c64a1850 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 17 Nov 2013 22:14:13 +0100 Subject: [PATCH 054/425] release 2013.11.17 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b04238eb5..110058c79 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.15.1' +__version__ = '2013.11.17' From fccd377198e9be2ea95cc761d841704a5f4f5ec5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:05:18 +0100 Subject: [PATCH 055/425] Suppor embed-only videos (Fixes #1746) --- youtube_dl/extractor/youtube.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1aa549740..514a11f7f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,7 +139,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' - _VALID_URL = r"""^ + _VALID_URL = r"""(?xi)^ ( (?:https?://)? 
# http(s):// (optional) (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| @@ -363,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"justintimberlakeVEVO" } }, + { + u"url": u"https://www.YouTube.com/watch?v=yZIXLfi8CZQ", + u"file": u"yZIXLfi8CZQ.mp4", + u"note": u"Embed-only video (#1746)", + u"info_dict": { + u"upload_date": u"20120608", + u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012", + u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7", + u"uploader": u"SET India", + u"uploader_id": u"setindia" + } + }, ] @@ -370,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" if YoutubePlaylistIE.suitable(url): return False - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url) is not None def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) @@ -1272,7 +1284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube data = compat_urllib_parse.urlencode({'video_id': video_id, - 'el': 'embedded', + 'el': 'player_embedded', 'gl': 'US', 'hl': 'en', 'eurl': 'https://youtube.googleapis.com/v/' + video_id, From 96b31b65331710ad6d3582abc7ecbaee0efa605a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:05:58 +0100 Subject: [PATCH 056/425] Add iPhone to UA (#1746) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1d9785341..659ac859d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -176,7 +176,7 @@ def compat_ord(c): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 
Firefox/10.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome) (iPhone)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From 83aa529330245b9221ae99e6769a40531f4ba216 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:18:17 +0100 Subject: [PATCH 057/425] Support protocol-independent URLs (#1787) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 514a11f7f..9aa4c6fab 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -141,7 +141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""(?xi)^ ( - (?:https?://)? # http(s):// (optional) + (?:https?://|//)? # http(s):// or protocol-independent URL (optional) (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains @@ -364,7 +364,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): } }, { - u"url": u"https://www.YouTube.com/watch?v=yZIXLfi8CZQ", + u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ", u"file": u"yZIXLfi8CZQ.mp4", u"note": u"Embed-only video (#1746)", u"info_dict": { From 887c6acdf231e6846e0dbb63b644fc7dc3378700 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:28:26 +0100 Subject: [PATCH 058/425] Support multiple embedded YouTube URLs (Fixes #1787) --- youtube_dl/extractor/generic.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c7552fddb..e1d6a2a01 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -162,6 
+162,16 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Failed to download URL: %s' % url) self.report_extraction(video_id) + + # it's tempting to parse this further, but you would + # have to take into account all the variations like + # Video Title - Site Name + # Site Name | Video Title + # Video Title - Tagline | Site Name + # and so on and so forth; it's just not practical + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title', default=u'video', flags=re.DOTALL) + # Look for BrightCove: bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is not None: @@ -177,11 +187,13 @@ class GenericIE(InfoExtractor): return self.url_result(surl, 'Vimeo') # Look for embedded YouTube player - mobj = re.search( - r']+?src=(["\'])(?Phttps?://(?:www\.)?youtube.com/embed/.+?)\1', webpage) - if mobj: - surl = unescapeHTML(mobj.group(u'url')) - return self.url_result(surl, 'Youtube') + matches = re.findall( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) + if matches: + urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') + for tuppl in matches] + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) # Look for Bandcamp pages with custom domain mobj = re.search(r']*?content="(.*?bandcamp\.com.*?)"', webpage) @@ -226,15 +238,6 @@ class GenericIE(InfoExtractor): video_extension = os.path.splitext(video_id)[1][1:] video_id = os.path.splitext(video_id)[0] - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = self._html_search_regex(r'(.*)', - webpage, u'video title', default=u'video', flags=re.DOTALL) - # video uploader is domain name video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', url, u'video uploader') From 
a81b4d5c8fe32611fa79104510d422b208f2a1a2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:30:43 +0100 Subject: [PATCH 059/425] release 2013.11.18 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 110058c79..16e32abe4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.17' +__version__ = '2013.11.18' From ae8f7871412ae9db40c5060d2d24e5a50f5fe9d0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:52:24 +0100 Subject: [PATCH 060/425] Remove iPhone from user agent. This breaks a lot of extractors In the future, it might be worth investigating whether we get better content when we claime to be an iPhone. --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 659ac859d..1d9785341 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -176,7 +176,7 @@ def compat_ord(c): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome) (iPhone)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From 9906d397a05de3c89fb0ba2d60c284c16cb72581 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:56:45 +0100 Subject: [PATCH 061/425] [auengine] Simplify --- youtube_dl/extractor/auengine.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 0febbff4f..90dfa9a46 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -1,10 
+1,10 @@ -import os.path import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, - compat_urllib_parse_urlparse, + determine_ext, + ExtractorError, ) class AUEngineIE(InfoExtractor): @@ -25,22 +25,26 @@ class AUEngineIE(InfoExtractor): title = self._html_search_regex(r'(?P<title>.+?)', webpage, u'title') title = title.strip() - links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage) - links = [compat_urllib_parse.unquote(l) for l in links] + links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage) + links = map(compat_urllib_parse.unquote, links) + + thumbnail = None + video_url = None for link in links: - root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path) - if pathext == '.png': + if link.endswith('.png'): thumbnail = link - elif pathext == '.mp4': - url = link - ext = pathext + elif '/videos/' in link: + video_url = link + if not video_url: + raise ExtractorError(u'Could not find video URL') + ext = u'.' 
+ determine_ext(video_url) if ext == title[-len(ext):]: title = title[:-len(ext)] - ext = ext[1:] - return [{ + + return { 'id': video_id, - 'url': url, + 'url': video_url, 'ext': ext, 'title': title, 'thumbnail': thumbnail, - }] + } From 9a942a46712330173683f8a0179868ae0c5a9138 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 13:56:53 +0100 Subject: [PATCH 062/425] release 2013.11.18.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 16e32abe4..6b3bf39fa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.18' +__version__ = '2013.11.18.1' From 4113e6ab561e6f2f014fb2548d1c8691352b8304 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 14:36:01 +0100 Subject: [PATCH 063/425] [auengine] Do not return unnecessary ext --- youtube_dl/extractor/auengine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 90dfa9a46..95c038003 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -44,7 +44,6 @@ class AUEngineIE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'ext': ext, 'title': title, 'thumbnail': thumbnail, } From efd6c574a2cabc860f54018af726cd291cbec868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 18 Nov 2013 16:35:41 +0100 Subject: [PATCH 064/425] Correctly write and restore the console title on the stack (fixes #1782) --- youtube_dl/YoutubeDL.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 20eed96ca..fd5a30a0c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -217,13 +217,15 @@ class YoutubeDL(object): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: - write_string(u'\033[22t', 
self._screen_file) + # Save the title on stack + write_string(u'\033[22;0t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: - write_string(u'\033[23t', self._screen_file) + # Restore the title from stack + write_string(u'\033[23;0t', self._screen_file) def __enter__(self): self.save_console_title() From cb7dfeeac407ca973b7d838dd3c8f6faa0a65bac Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 18 Nov 2013 16:42:35 +0100 Subject: [PATCH 065/425] [youtube] only allow domain name to be upper-case (#1786) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9aa4c6fab..41838237c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,10 +139,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' - _VALID_URL = r"""(?xi)^ + _VALID_URL = r"""(?x)^ ( (?:https?://|//)? # http(s):// or protocol-independent URL (optional) - (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls From 34b3afc7be059cbc882b901b897ec99eeee25e14 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 19 Nov 2013 12:41:01 +0100 Subject: [PATCH 066/425] release 2013.11.19 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6b3bf39fa..e9ff3f640 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.18.1' +__version__ = '2013.11.19' From 69545c2affb6b126398fd657a12a560a9857dbee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 19 Nov 2013 20:43:49 +0100 Subject: [PATCH 067/425] [d8] inherit from CanalplusIE it reuses the same extraction process --- youtube_dl/extractor/canalplus.py | 3 ++- youtube_dl/extractor/d8.py | 36 +++---------------------------- 2 files changed, 5 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1db9b24cf..bfa2a8b40 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,6 +5,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import unified_strdate + class CanalplusIE(InfoExtractor): _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P.*)|player\.canalplus\.fr/#/(?P\d+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' @@ -25,7 +26,7 @@ class CanalplusIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.groupdict().get('id') if video_id is None: webpage = self._download_webpage(url, mobj.group('path')) video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py index 5ce483b16..a56842b16 100644 --- a/youtube_dl/extractor/d8.py +++ b/youtube_dl/extractor/d8.py @@ -1,11 +1,8 @@ # 
encoding: utf-8 -import re -import xml.etree.ElementTree +from .canalplus import CanalplusIE -from .common import InfoExtractor -from ..utils import unified_strdate -class D8IE(InfoExtractor): +class D8IE(CanalplusIE): _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P.*)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s' IE_NAME = u'd8.tv' @@ -19,34 +16,7 @@ class D8IE(InfoExtractor): u'upload_date': u'20131108', }, u'params': { + # rtmp u'skip_download': True, }, } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - webpage = self._download_webpage(url, mobj.group('path')) - video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') - info_url = self._VIDEO_INFO_TEMPLATE % video_id - info_page = self._download_webpage(info_url,video_id, - u'Downloading video info') - - self.report_extraction(video_id) - doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) - video_info = [video for video in doc if video.find('ID').text == video_id][0] - infos = video_info.find('INFOS') - media = video_info.find('MEDIA') - formats = [media.find('VIDEOS/%s' % format) - for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] - video_url = [format.text for format in formats if format is not None][-1] - - return {'id': video_id, - 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, - infos.find('TITRAGE/SOUS_TITRE').text), - 'url': video_url, - 'ext': 'flv', - 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), - 'thumbnail': media.find('IMAGES/GRAND').text, - 'description': infos.find('DESCRIPTION').text, - 'view_count': int(infos.find('NB_VUES').text), - } From 59040888112e7e5a22ef17bacc27ca88d58a5d92 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:13:19 +0100 Subject: [PATCH 068/425] Add support for tou.tv (Fixes #1792) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/common.py | 28 ++++++++++++ youtube_dl/extractor/toutv.py | 75 
++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 2 + 4 files changed, 106 insertions(+) create mode 100644 youtube_dl/extractor/toutv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2679d1a8f..b0df1cef7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -133,6 +133,7 @@ from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE +from .toutv import TouTvIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .tube8 import Tube8IE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f787d0a3c..eb3435c77 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -350,6 +350,17 @@ class InfoExtractor(object): if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) + def _html_search_meta(self, name, html, display_name=None): + if display_name is None: + display_name = name + return self._html_search_regex( + r'''(?ix)]+(?:name|property)=["\']%s["\']) + [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), + html, display_name, fatal=False) + + def _dc_search_uploader(self, html): + return self._html_search_meta('dc.creator', html, 'uploader') + def _rta_search(self, html): # See http://www.rtalabel.org/index.php?content=howtofaq#single if re.search(r'(?ix)[a-zA-Z0-9_-]+(?:/(?PS[0-9]+E[0-9]+)))' + + _TEST = { + u'url': u'http://www.tou.tv/30-vies/S04E41', + u'file': u'30-vies_S04E41.mp4', + u'info_dict': { + u'title': u'30 vies Saison 4 / Épisode 41', + u'description': u'md5:da363002db82ccbe4dafeb9cab039b09', + u'age_limit': 8, + u'uploader': u'Groupe des Nouveaux Médias', + u'duration': 1296, + u'upload_date': u'20131118', + u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', + }, + u'params': { + u'skip_download': True, # Requires 
rtmpdump + }, + u'xskip': 'Only available in Canada' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + mediaId = self._search_regex( + r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') + + # TODO test from de + streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId + streams_webpage = self._download_webpage( + streams_url, video_id, note=u'Downloading stream list') + + streams_doc = xml.etree.ElementTree.fromstring( + streams_webpage.encode('utf-8')) + video_url = next(n.text + for n in streams_doc.findall('.//choice/url') + if u'//ad.doubleclick' not in n.text) + if video_url.endswith('/Unavailable.flv'): + raise ExtractorError( + u'Access to this video is blocked from outside of Canada', + expected=True) + + duration_str = self._html_search_meta( + 'video:duration', webpage, u'duration') + duration = int(duration_str) if duration_str else None + upload_date_str = self._html_search_meta( + 'video:release_date', webpage, u'upload date') + upload_date = unified_strdate(upload_date_str) if upload_date_str else None + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'url': video_url, + 'description': self._og_search_description(webpage), + 'uploader': self._dc_search_uploader(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': self._media_rating_search(webpage), + 'duration': duration, + 'upload_date': upload_date, + 'ext': 'mp4', + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1d9785341..b50c8166f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -734,6 +734,8 @@ def unified_strdate(date_str): '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M', '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: From 9d1538182fdc540fa79ec8341385a980cde334e7 Mon Sep 17 00:00:00 2001 From: Mohamedh Fazal 
Date: Tue, 19 Nov 2013 18:59:22 +0500 Subject: [PATCH 069/425] Add an option to dump json information --- youtube_dl/YoutubeDL.py | 3 +++ youtube_dl/__init__.py | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fd5a30a0c..d8d55d7d7 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -84,6 +84,7 @@ class YoutubeDL(object): forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. forcefilename: Force printing final filename. + forcejson: Force printing json information. simulate: Do not download the video files. format: Video format code. format_limit: Highest quality format to try. @@ -651,6 +652,8 @@ class YoutubeDL(object): compat_print(filename) if self.params.get('forceformat', False): compat_print(info_dict['format']) + if self.params.get('forcejson', False): + compat_print(json.dumps(info_dict)) # Do nothing else if in simulate mode if self.params.get('simulate', False): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index af4c9c5c4..7ab99bdd1 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -306,6 +306,9 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--get-format', action='store_true', dest='getformat', help='simulate, quiet but print output format', default=False) + verbosity.add_option('-j', '--dump-json', + action='store_true', dest='dumpjson', + help='simulate, quiet but print json information', default=False) verbosity.add_option('--newline', action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) verbosity.add_option('--no-progress', @@ -608,7 +611,7 @@ def _real_main(argv=None): 'username': opts.username, 'password': opts.password, 'videopassword': opts.videopassword, - 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), + 
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), 'forceurl': opts.geturl, 'forcetitle': opts.gettitle, 'forceid': opts.getid, @@ -616,8 +619,9 @@ def _real_main(argv=None): 'forcedescription': opts.getdescription, 'forcefilename': opts.getfilename, 'forceformat': opts.getformat, + 'forcejson': opts.dumpjson, 'simulate': opts.simulate, - 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), + 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), 'format': opts.format, 'format_limit': opts.format_limit, 'listformats': opts.listformats, From 8694c60000d2aed2b37e10da0ffda132249ff10a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:18:24 +0100 Subject: [PATCH 070/425] import json for --dump-json --- youtube_dl/YoutubeDL.py | 3 ++- youtube_dl/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d8d55d7d7..bd093add8 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,6 +5,7 @@ from __future__ import absolute_import import errno import io +import json import os import re import shutil @@ -84,7 +85,7 @@ class YoutubeDL(object): forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. forcefilename: Force printing final filename. - forcejson: Force printing json information. + forcejson: Force printing info_dict as JSON. simulate: Do not download the video files. format: Video format code. format_limit: Highest quality format to try. 
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7ab99bdd1..e27bd4544 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -308,7 +308,7 @@ def parseOpts(overrideArguments=None): help='simulate, quiet but print output format', default=False) verbosity.add_option('-j', '--dump-json', action='store_true', dest='dumpjson', - help='simulate, quiet but print json information', default=False) + help='simulate, quiet but print JSON information', default=False) verbosity.add_option('--newline', action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) verbosity.add_option('--no-progress', From cc13cc0251b287ac88883096fcc36a8072e7ab5d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:25:33 +0100 Subject: [PATCH 071/425] [teamcoco] Correct error --- youtube_dl/extractor/teamcoco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index bc48620f0..165d9f88b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -60,7 +60,7 @@ class TeamcocoIE(InfoExtractor): return -1 formats.sort(key=sort_key) if not formats: - raise RegexNotFoundError(u'Unable to extract video URL') + raise ExtractorError(u'Unable to extract video URL') return { 'id': video_id, From f3682997d7a709e7cf74ba83773570d2a99beb12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:27:48 +0100 Subject: [PATCH 072/425] Clean up unused imports and other minor mistakes --- youtube_dl/extractor/collegehumor.py | 4 +--- youtube_dl/extractor/eighttracks.py | 1 - youtube_dl/extractor/facebook.py | 1 - youtube_dl/extractor/fktv.py | 1 - youtube_dl/extractor/gamespot.py | 2 +- youtube_dl/extractor/jeuxvideo.py | 2 +- youtube_dl/extractor/livestream.py | 2 -- youtube_dl/extractor/mtv.py | 1 - youtube_dl/extractor/pornhub.py | 1 - youtube_dl/extractor/soundcloud.py | 1 - 
youtube_dl/extractor/spankwire.py | 1 - youtube_dl/extractor/spiegel.py | 1 - youtube_dl/extractor/ted.py | 3 +-- youtube_dl/extractor/tube8.py | 2 -- youtube_dl/extractor/xtube.py | 1 - youtube_dl/extractor/zdf.py | 2 +- 16 files changed, 5 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 8d4c93d6d..0c29acfb1 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -71,10 +71,8 @@ class CollegeHumorIE(InfoExtractor): adoc = xml.etree.ElementTree.fromstring(manifestXml) try: - media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] - node_id = media_node.attrib['url'] video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError as err: + except IndexError: raise ExtractorError(u'Invalid manifest file') url_pr = compat_urllib_parse_urlparse(info['thumbnail']) info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index 2cfbcd363..f21ef8853 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -1,4 +1,3 @@ -import itertools import json import random import re diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f8bdfc2d3..3b210710e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,5 +1,4 @@ import json -import netrc import re import socket diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 9c89362ef..dba1a8dc2 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -39,7 +39,6 @@ class FKTVIE(InfoExtractor): for i, _ in enumerate(files, 1): video_id = '%04d%d' % (episode, i) video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) - video_title = 
'Fernsehkritik %d.%d' % (episode, i) videos.append({ 'id': video_id, 'url': video_url, diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 098768361..9645b00c3 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -24,7 +24,7 @@ class GameSpotIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_id = video_id = mobj.group('page_id') + page_id = mobj.group('page_id') webpage = self._download_webpage(url, page_id) data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') data_video = json.loads(unescapeHTML(data_video_json)) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 6bb54b932..0020c47cf 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -22,7 +22,7 @@ class JeuxVideoIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - title = re.match(self._VALID_URL, url).group(1) + title = mobj.group(1) webpage = self._download_webpage(url, title) xml_link = self._html_search_regex( r'', diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 1a3e0ae6b..5f548437c 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -6,9 +6,7 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, compat_urlparse, - get_meta_content, xpath_with_ns, - ExtractorError, ) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 3df7f9b85..04afd6c4c 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -59,7 +59,6 @@ class MTVIE(InfoExtractor): if '/error_country_block.swf' in metadataXml: raise ExtractorError(u'This video is not available from your country.', expected=True) mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) - renditions = mdoc.findall('.//rendition') formats = [] for 
rendition in mdoc.findall('.//rendition'): diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 75cf4bb9f..8b3471919 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -6,7 +6,6 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, - unescapeHTML, ) from ..aes import ( aes_decrypt_text diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 83e1f055f..687457e10 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -158,7 +158,6 @@ class SoundcloudSetIE(SoundcloudIE): resolv_url = self._resolv_url(url) info_json = self._download_webpage(resolv_url, full_title) - videos = [] info = json.loads(info_json) if 'errors' in info: for err in info['errors']: diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 97f9c268a..794550c81 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -6,7 +6,6 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, - unescapeHTML, ) from ..aes import ( aes_decrypt_text diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 6dc2eda6d..19ce585cf 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -2,7 +2,6 @@ import re import xml.etree.ElementTree from .common import InfoExtractor -from ..utils import determine_ext class SpiegelIE(InfoExtractor): diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 2e497c86e..4bca62ba0 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -4,7 +4,6 @@ import re from .subtitles import SubtitlesInfoExtractor from ..utils import ( - compat_str, RegexNotFoundError, ) @@ -113,6 +112,6 @@ class TEDIE(SubtitlesInfoExtractor): url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) 
sub_lang_list[l] = url return sub_lang_list - except RegexNotFoundError as err: + except RegexNotFoundError: self._downloader.report_warning(u'video doesn\'t have subtitles') return {} diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index d4b7603c7..4d9d41db3 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -5,8 +5,6 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, - compat_urllib_parse, - unescapeHTML, ) from ..aes import ( aes_decrypt_text diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 03ad88bed..e3458d2bd 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, - compat_urllib_parse, ) class XTubeIE(InfoExtractor): diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index faed7ff7f..c6a9d06f2 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -53,7 +53,7 @@ class ZDFIE(InfoExtractor): video_id, u'Get stream URL') - MMS_STREAM = r'href="(?Pmms://[^"]*)"' + #MMS_STREAM = r'href="(?Pmms://[^"]*)"' RTSP_STREAM = r'(?Prtsp://[^"]*.mp4)' mobj = re.search(self._MEDIA_STREAM, media_link) From 4eb92208a3bf05d0860e1f138380e8b5cae20c14 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:34:48 +0100 Subject: [PATCH 073/425] Adapt test to changed .info.json name --- test/test_download.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 16f200809..fe7f7b8cb 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -103,7 +103,7 @@ def generator(test_case): tc_filename = get_tc_filename(tc) try_rm(tc_filename) try_rm(tc_filename + '.part') - try_rm(tc_filename + '.info.json') + try_rm(os.path.splitext(tc_filename)[0] + 
'.info.json') try_rm_tcs_files() try: try_num = 1 @@ -130,11 +130,12 @@ def generator(test_case): if not test_case.get('params', {}).get('skip_download', False): self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) self.assertTrue(tc_filename in finished_hook_called) - self.assertTrue(os.path.exists(tc_filename + '.info.json')) + info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json' + self.assertTrue(os.path.exists(info_json_fn)) if 'md5' in tc: md5_for_file = _file_md5(tc_filename) self.assertEqual(md5_for_file, tc['md5']) - with io.open(tc_filename + '.info.json', encoding='utf-8') as infof: + with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, expected) in tc.get('info_dict', {}).items(): if isinstance(expected, compat_str) and expected.startswith('md5:'): From 8f053519846d8758de4e3ad67960e4161407d334 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:35:02 +0100 Subject: [PATCH 074/425] [anitube] Minor fixes (#1776) --- youtube_dl/extractor/anitube.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 2954966a6..679027f4e 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -6,11 +6,12 @@ from .common import InfoExtractor class AnitubeIE(InfoExtractor): IE_NAME = u'anitube.se' - _VALID_URL = r'http?://(?:www\.)?anitube\.se/video/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P\d+)' _TEST = { u'url': u'http://www.anitube.se/video/36621', - u'md5': u'0c4e4f1051bf50f5982f829f7230f539', + u'md5': u'59d0eeae28ea0bc8c05e7af429998d43', + u'file': u'36621.mp4', u'info_dict': { u'id': u'36621', u'ext': u'mp4', @@ -23,27 +24,22 @@ class AnitubeIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - key = 
self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, u'key') webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8')) video_title = config_xml.find('title').text - formats = [] - video_url = config_xml.find('file') if video_url is not None: formats.append({ 'format_id': 'sd', 'url': video_url.text, }) - video_url = config_xml.find('filehd') if video_url is not None: formats.append({ @@ -54,6 +50,5 @@ class AnitubeIE(InfoExtractor): return { 'id': video_id, 'title': video_title, - 'ext': 'mp4', 'formats': formats } From 0a120f74b2556f3569d285993c1a0f3dc654f07a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:36:00 +0100 Subject: [PATCH 075/425] Credit @diffycat for anitube --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e27bd4544..64ebf4d48 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -34,6 +34,7 @@ __authors__ = ( 'Andras Elso', 'Jelle van der Waa', 'Marcin Cieślak', + 'Anton Larionov', ) __license__ = 'Public Domain' From 100959a6d9fee68d2876e112f4be260ade756c40 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:47:50 +0100 Subject: [PATCH 076/425] [escapist] Add support for HD format (Closes #1755) --- youtube_dl/extractor/escapist.py | 57 +++++++++++++++++++------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 3aa2da52c..f9681fd43 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -11,11 +11,11 @@ from ..utils import ( class EscapistIE(InfoExtractor): - _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?]?.*$' + _VALID_URL = 
r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?]?.*$' _TEST = { u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4', - u'md5': u'c6793dbda81388f4264c1ba18684a74d', + u'md5': u'ab3a706c681efca53f0a35f1415cf0d1', u'info_dict': { u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", u"uploader": u"the-escapist-presents", @@ -25,50 +25,61 @@ class EscapistIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) showName = mobj.group('showname') videoId = mobj.group('episode') self.report_extraction(videoId) webpage = self._download_webpage(url, videoId) - videoDesc = self._html_search_regex(' Date: Wed, 20 Nov 2013 06:55:07 +0100 Subject: [PATCH 077/425] [escapist] Fix syntax error --- youtube_dl/extractor/escapist.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index f9681fd43..03ebb7c48 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -66,13 +66,12 @@ class EscapistIE(InfoExtractor): }) _add_format(u'normal', configUrl) + hq_url = (configUrl + + ('&hq=1' if '?' in configUrl else configUrl + '?hq=1')) try: - hq_url = (configUrl + - ('&hq=1' if '?' 
in configUrl else configUrl + '?hq=1')) - try: - _add_format(u'hq', hq_url) - except ExtractorError: - pass # That's fine, we'll just use normal quality + _add_format(u'hq', hq_url) + except ExtractorError: + pass # That's fine, we'll just use normal quality return { 'id': videoId, From c5edcde21f772c6c51bea79edc4ac48f5e5dd8d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 06:56:59 +0100 Subject: [PATCH 078/425] [escapist] upper-case URL --- youtube_dl/extractor/escapist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 03ebb7c48..2396f163b 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -35,13 +35,13 @@ class EscapistIE(InfoExtractor): r' Date: Wed, 20 Nov 2013 07:23:23 +0100 Subject: [PATCH 079/425] [escapist] Fix title search --- youtube_dl/extractor/escapist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 2396f163b..b1242f6bc 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -35,11 +35,11 @@ class EscapistIE(InfoExtractor): r' Date: Wed, 20 Nov 2013 07:25:17 +0100 Subject: [PATCH 080/425] release 2013.11.20 --- README.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6632e5865..580b16004 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ which means you can modify it, redistribute it or use it however you like. 
--get-description simulate, quiet but print video description --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format + -j, --dump-json simulate, quiet but print JSON information --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e9ff3f640..ad3fad818 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.19' +__version__ = '2013.11.20' From f99e0f1ed62e6210a52f1a4bddb860a5b09623ee Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 07:37:07 +0100 Subject: [PATCH 081/425] Adapt age restriction tests to new .info.json filenames --- test/test_age_restriction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index d500c6edc..506572e9e 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -24,7 +24,7 @@ def _download_restricted(url, filename, age): } ydl = YoutubeDL(params) ydl.add_default_info_extractors() - json_filename = filename + '.info.json' + json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) ydl.download([url]) res = os.path.exists(json_filename) From 9a98a466b34206f61543b3efdbcba40d58e51052 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 07:37:14 +0100 Subject: [PATCH 082/425] [toutv] really skip test --- youtube_dl/extractor/toutv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 73ea67da9..732083617 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -28,7 +28,7 @@ class TouTvIE(InfoExtractor): u'params': { u'skip_download': True, # Requires rtmpdump }, - u'xskip': 'Only available in Canada' + u'skip': 'Only 
available in Canada' } def _real_extract(self, url): From c4864091a1f867c4ce74f459c31ac85d96c417ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 07:43:21 +0100 Subject: [PATCH 083/425] [videopremium] Support new crazy redirect scheme --- youtube_dl/extractor/videopremium.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py index 65f39b982..4800415bd 100644 --- a/youtube_dl/extractor/videopremium.py +++ b/youtube_dl/extractor/videopremium.py @@ -24,12 +24,16 @@ class VideoPremiumIE(InfoExtractor): webpage_url = 'http://videopremium.tv/' + video_id webpage = self._download_webpage(webpage_url, video_id) - self.report_extraction(video_id) + if re.match(r"^]*>window.location\s*=", webpage): + # Download again, we need a cookie + webpage = self._download_webpage( + webpage_url, video_id, + note=u'Downloading webpage again (with cookie)') - video_title = self._html_search_regex(r'\s*(.+?)\s*<', - webpage, u'video title') + video_title = self._html_search_regex( + r'\s*(.+?)\s*<', webpage, u'video title') - return [{ + return { 'id': video_id, 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16), 'play_path': "mp4:%s.f4v" % video_id, @@ -37,4 +41,4 @@ class VideoPremiumIE(InfoExtractor): 'player_url': "http://videopremium.tv/uplayer/uppod.swf", 'ext': 'f4v', 'title': video_title, - }] + } \ No newline at end of file From 0ad97bbc056c83d49bc771ac202a9eb8354686de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 07:45:32 +0100 Subject: [PATCH 084/425] [spankwire] fix check for description --- youtube_dl/extractor/spankwire.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 794550c81..995b46444 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -35,11 +35,12 @@ class 
SpankwireIE(InfoExtractor): webpage = self._download_webpage(req, video_id) video_title = self._html_search_regex(r'

([^<]+)', webpage, u'title') - video_uploader = self._html_search_regex(r'by:\s*]*>(.+?)', webpage, u'uploader', fatal=False) - thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False) - description = self._html_search_regex(r'>\s*Description:\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False) - if len(description) == 0: - description = None + video_uploader = self._html_search_regex( + r'by:\s*]*>(.+?)', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex( + r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False) + description = self._html_search_regex( + r'>\s*Description:\s*<[^>]+>([^<]+)', webpage, u'description', fatal=False) video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: From f2e87ef4fab5f6d000a7bb152b535a748866bca3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 07:46:44 +0100 Subject: [PATCH 085/425] [anitube] Skip test (on travis) --- youtube_dl/extractor/anitube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 679027f4e..691d5a844 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -17,6 +17,7 @@ class AnitubeIE(InfoExtractor): u'ext': u'mp4', u'title': u'Recorder to Randoseru 01', }, + u'skip': u'Blocked in the US', } def _real_extract(self, url): From ca872a4c0b5a926be9dccec6dc43ebb5b8a6abb7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 09:23:53 +0100 Subject: [PATCH 086/425] [spankwire] Fix description search --- youtube_dl/extractor/spankwire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 995b46444..9e2ad0d99 100644 --- a/youtube_dl/extractor/spankwire.py +++ 
b/youtube_dl/extractor/spankwire.py @@ -40,7 +40,7 @@ class SpankwireIE(InfoExtractor): thumbnail = self._html_search_regex( r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False) description = self._html_search_regex( - r'>\s*Description:\s*<[^>]+>([^<]+)', webpage, u'description', fatal=False) + r'([^<]+)<', webpage, u'description', fatal=False) video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: From 0190eecc00c45b26eb5ef5444181cd996b139f18 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 09:45:22 +0100 Subject: [PATCH 087/425] [nhl] Make NHLVideocenter IE_DESC fit with other descriptions --- youtube_dl/extractor/nhl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 224f56ac8..458fe4063 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -72,7 +72,7 @@ class NHLIE(NHLBaseInfoExtractor): class NHLVideocenterIE(NHLBaseInfoExtractor): IE_NAME = u'nhl.com:videocenter' - IE_DESC = u'Download the first 12 videos from a videocenter category' + IE_DESC = u'NHL videocenter category' _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P[^&]+))?' 
@classmethod From 9e4f50a8aee0d3f030be203686c8f0479b32b793 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 Nov 2013 09:59:03 +0100 Subject: [PATCH 088/425] [sztv] skip test, site is undergoing mid-term maintenance --- youtube_dl/extractor/sztvhu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index 81fa35c4b..c9359fafb 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -15,7 +15,8 @@ class SztvHuIE(InfoExtractor): u'info_dict': { u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...', - } + }, + u'skip': u'Service temporarily disabled as of 2013-11-20' } def _real_extract(self, url): From 64bb5187f53caea0e6067434c95da60284eef2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 21 Nov 2013 13:16:19 +0100 Subject: [PATCH 089/425] [soundcloud] Retrieve the file url using the client_id for the iPhone (fixes #1798) The desktop's client_id always give the rtmp url, but with the iPhone one it returns the http url if it's available. 
--- youtube_dl/extractor/soundcloud.py | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 687457e10..5544325b5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -59,6 +59,7 @@ class SoundcloudIE(InfoExtractor): ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @classmethod def suitable(cls, url): @@ -83,7 +84,6 @@ class SoundcloudIE(InfoExtractor): thumbnail = thumbnail.replace('-large', '-t500x500') result = { 'id': track_id, - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'uploader': info['user']['username'], 'upload_date': unified_strdate(info['created_at']), 'title': info['title'], @@ -92,19 +92,29 @@ class SoundcloudIE(InfoExtractor): 'thumbnail': thumbnail, } if info.get('downloadable', False): + # We can build a direct link to the song result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) - if not info.get('streamable', False): - # We have to get the rtmp url + else: + # We have to retrieve the url stream_json = self._download_webpage( - 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID), + 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID), track_id, u'Downloading track url') - rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url'] - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = rtmp_url.split('mp3:', 1) - result.update({ - 'url': url, - 'play_path': 'mp3:' + path, - }) + # There should be only one entry in the dictionary + key, stream_url = list(json.loads(stream_json).items())[0] + if key.startswith(u'http'): + result['url'] = stream_url + elif key.startswith(u'rtmp'): + # The url doesn't have an rtmp app, we have to extract 
the playpath + url, path = stream_url.split('mp3:', 1) + result.update({ + 'url': url, + 'play_path': 'mp3:' + path, + }) + else: + # We fallback to the stream_url in the original info, this + # cannot be always used, sometimes it can give an HTTP 404 error + resut['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID, + return result def _real_extract(self, url): From 2bb683c2012ad45b0e3008664be5994aa603c196 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 21 Nov 2013 13:59:33 +0100 Subject: [PATCH 090/425] release 2013.11.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ad3fad818..17ce776a9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.20' +__version__ = '2013.11.21' From a6a173c2fddf2fa38a69ca750431b7ca6932bcb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 21 Nov 2013 14:09:28 +0100 Subject: [PATCH 091/425] utils.shell_quote: Convert the args to unicode strings The youtube test video failed with `UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 34: ordinal not in range(128)`, the problem was with the filenames being encoded. 
--- test/test_utils.py | 6 ++++++ youtube_dl/utils.py | 11 ++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index f3fbff042..e9e590e74 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -24,6 +24,8 @@ from youtube_dl.utils import ( xpath_with_ns, smuggle_url, unsmuggle_url, + shell_quote, + encodeFilename, ) if sys.version_info < (3, 0): @@ -170,6 +172,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(res_url, url) self.assertEqual(res_data, None) + def test_shell_quote(self): + args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')] + self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""") + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b50c8166f..0720fe9eb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -951,7 +951,16 @@ class locked_file(object): def shell_quote(args): - return ' '.join(map(pipes.quote, args)) + quoted_args = [] + encoding = sys.getfilesystemencoding() + if encoding is None: + encoding = 'utf-8' + for a in args: + if isinstance(a, bytes): + # We may get a filename encoded with 'encodeFilename' + a = a.decode(encoding) + quoted_args.append(pipes.quote(a)) + return u' '.join(quoted_args) def takewhile_inclusive(pred, seq): From b1c9c669365cea4fb94babd66076665685abc453 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 21 Nov 2013 23:26:28 +0100 Subject: [PATCH 092/425] Remove unnecessary slash in setup.py (Fixes #1778) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index aa7cfca08..8e24fe679 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ else: 'data_files': [ # Installing system-wide would require sudo... 
('etc/bash_completion.d', ['youtube-dl.bash-completion']), ('share/doc/youtube_dl', ['README.txt']), - ('share/man/man1/', ['youtube-dl.1']) + ('share/man/man1', ['youtube-dl.1']) ] } if setuptools_available: From 0980426559741bb9a8b2ea39b581073cf2738f5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 22 Nov 2013 16:05:14 +0100 Subject: [PATCH 093/425] [bandcamp] add support for albums (reported in #1270) --- test/test_playlists.py | 9 +++++++++ youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/bandcamp.py | 24 ++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 3 ++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 706b6bdca..d83b3bf51 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -22,6 +22,7 @@ from youtube_dl.extractor import ( LivestreamIE, NHLVideocenterIE, BambuserChannelIE, + BandcampAlbumIE ) @@ -103,5 +104,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'pixelversity') self.assertTrue(len(result['entries']) >= 66) + def test_bandcamp_album(self): + dl = FakeYDL() + ie = BandcampAlbumIE(dl) + result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Nightmare Night EP') + self.assertTrue(len(result['entries']) >= 4) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ffb74df9f..802beef21 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -11,7 +11,7 @@ from .arte import ( ) from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE -from .bandcamp import BandcampIE +from .bandcamp import BandcampIE, BandcampAlbumIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .breakcom import BreakIE diff --git 
a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 129a20f44..81d5c60e9 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -3,11 +3,13 @@ import re from .common import InfoExtractor from ..utils import ( + compat_urlparse, ExtractorError, ) class BandcampIE(InfoExtractor): + IE_NAME = u'Bandcamp' _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' _TEST = { u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', @@ -61,3 +63,25 @@ class BandcampIE(InfoExtractor): } return [track_info] + + +class BandcampAlbumIE(InfoExtractor): + IE_NAME = u'Bandcamp:album' + _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) + if not tracks_paths: + raise ExtractorError(u'The page doesn\'t contain any track') + entries = [ + self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) + for t_path in tracks_paths] + title = self._search_regex(r'album_title : "(.*?)"', webpage, u'title') + return { + '_type': 'playlist', + 'title': title, + 'entries': entries, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e1d6a2a01..0b5f2b2bb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -199,7 +199,8 @@ class GenericIE(InfoExtractor): mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: burl = unescapeHTML(mobj.group(1)) - return self.url_result(burl, 'Bandcamp') + # Don't set the extractor because it can be a track url or an album + return self.url_result(burl) # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) From ab009f59ef4a4e2a4842d37212b2e57466d9f22e Mon 
Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 17:17:55 +0100 Subject: [PATCH 094/425] [toutv] Fix a typo --- youtube_dl/extractor/soundcloud.py | 2 +- youtube_dl/extractor/toutv.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5544325b5..67b2dff9c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -113,7 +113,7 @@ class SoundcloudIE(InfoExtractor): else: # We fallback to the stream_url in the original info, this # cannot be always used, sometimes it can give an HTTP 404 error - resut['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID, + result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID, return result diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 732083617..2f728d3dc 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -39,7 +39,6 @@ class TouTvIE(InfoExtractor): mediaId = self._search_regex( r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') - # TODO test from de streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId streams_webpage = self._download_webpage( streams_url, video_id, note=u'Downloading stream list') From 02e4ebbbad5653b9bbbcf615bdcae6b2c7ea1e30 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 17:19:22 +0100 Subject: [PATCH 095/425] [streamcloud] Add IE (Fixes #1801) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/streamcloud.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 youtube_dl/extractor/streamcloud.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 802beef21..02f9e2546 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -128,6 +128,7 @@ from .spiegel import SpiegelIE from 
.stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE +from .streamcloud import StreamcloudIE from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py new file mode 100644 index 000000000..d476693ec --- /dev/null +++ b/youtube_dl/extractor/streamcloud.py @@ -0,0 +1,65 @@ +# coding: utf-8 +import re +import time + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + + +class StreamcloudIE(InfoExtractor): + IE_NAME = u'streamcloud.eu' + _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html' + + _TEST = { + u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', + u'file': u'skp9j99s4bpz.mp4', + u'md5': u'6bea4c7fa5daaacc2a946b7146286686', + u'info_dict': { + u'title': u'youtube-dl test video \'/\\ ä ↭', + u'duration': 9, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + orig_webpage = self._download_webpage(url, video_id) + + fields = re.findall(r'''(?x)<input\s+ + type="(?:hidden|submit)"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? 
+ value="([^"]*)" + ''', orig_webpage) + post = compat_urllib_parse.urlencode(fields) + + self.to_screen('%s: Waiting for timeout' % video_id) + time.sleep(12) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + + webpage = self._download_webpage( + req, video_id, note=u'Downloading video page ...') + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)<', webpage, u'title') + video_url = self._search_regex( + r'file:\s*"([^"]+)"', webpage, u'video URL') + duration_str = self._search_regex( + r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False) + duration = None if duration_str is None else int(duration_str) + thumbnail = self._search_regex( + r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'duration': duration, + 'thumbnail': thumbnail, + } From cffa6aa10757099bfb46c5c4eff5c5d9a3ff97f4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 17:44:55 +0100 Subject: [PATCH 096/425] [bandcamp] Support trackinfo-style songs (Fixes #1270) --- youtube_dl/extractor/bandcamp.py | 46 ++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 81d5c60e9..8b1d56da3 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -3,6 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( + compat_str, compat_urlparse, ExtractorError, ) @@ -11,7 +12,7 @@ from ..utils import ( class BandcampIE(InfoExtractor): IE_NAME = u'Bandcamp' _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)' - _TEST = { + _TESTS = [{ u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', u'file': u'1812978515.mp3', u'md5': u'cdeb30cdae1921719a3cbcab696ef53c', @@ -19,7 +20,28 @@ class BandcampIE(InfoExtractor): 
u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" }, u'skip': u'There is a limit of 200 free downloads / month for the test song' - } + }, { + u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + u'playlist': [ + { + u'file': u'1353101989.mp3', + u'md5': u'39bc1eded3476e927c724321ddf116cf', + u'info_dict': { + u'title': u'Intro', + } + }, + { + u'file': u'38097443.mp3', + u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', + u'info_dict': { + u'title': u'Kero One - Keep It Alive (Blazo remix)', + } + }, + ], + u'params': { + u'playlistend': 2 + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -28,6 +50,26 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: + m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) + if m_trackinfo: + json_code = m_trackinfo.group(1) + data = json.loads(json_code) + + entries = [] + for d in data: + formats = [{ + 'format_id': 'format_id', + 'url': format_url, + 'ext': format_id.partition('-')[0] + } for format_id, format_url in sorted(d['file'].items())] + entries.append({ + 'id': compat_str(d['id']), + 'title': d['title'], + 'formats': formats, + }) + + return self.playlist_result(entries, title, title) + else: raise ExtractorError(u'No free songs found') download_link = m_download.group(1) From bfe7439a2076452a70946ac146be463fa7277d67 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 17:46:26 +0100 Subject: [PATCH 097/425] release 2013.11.22 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 17ce776a9..c1f581cd6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.21' +__version__ = '2013.11.22' From 241650c7ff8afa7236598d0f95d20c0898abc02c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 18:20:31 +0100 Subject: [PATCH 098/425] [vimeo] Fix the extraction of vimeo pro and player.vimeo.com videos --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d465bf20b..7d82c2cfa 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -151,7 +151,7 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except Exception as e: From f143a42fe65ce4932dfab7be1f41c52ffe8a203e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 19:08:25 +0100 Subject: [PATCH 099/425] [bandcamp] Skip album test --- youtube_dl/extractor/bandcamp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 8b1d56da3..359d4174b 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -40,7 +40,8 @@ class BandcampIE(InfoExtractor): ], u'params': { u'playlistend': 2 - } + }, + u'skip': u'Bancamp imposes download limits. 
See test_playlists:test_bandcamp_album for the playlist test' }] def _real_extract(self, url): From 0b63aed8dfd36a5a7f5ae6518b9c385d9a43e735 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 19:15:36 +0100 Subject: [PATCH 100/425] [update] do not assign to unused variables --- youtube_dl/update.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index f41b4785a..e5f441707 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -109,7 +109,7 @@ def update_self(to_screen, verbose): urlh = compat_urllib_request.urlopen(version['exe'][0]) newcontent = urlh.read() urlh.close() - except (IOError, OSError) as err: + except (IOError, OSError): if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to download latest version') return @@ -122,7 +122,7 @@ def update_self(to_screen, verbose): try: with open(exe + '.new', 'wb') as outf: outf.write(newcontent) - except (IOError, OSError) as err: + except (IOError, OSError): if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to write the new version') return @@ -141,7 +141,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" subprocess.Popen([bat]) # Continues to run in the background return # Do not show premature success messages - except (IOError, OSError) as err: + except (IOError, OSError): if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to overwrite current version') return @@ -152,7 +152,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" urlh = compat_urllib_request.urlopen(version['bin'][0]) newcontent = urlh.read() urlh.close() - except (IOError, OSError) as err: + except (IOError, OSError): if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to download latest version') return @@ -165,7 +165,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" try: with open(filename, 'wb') as outf: 
outf.write(newcontent) - except (IOError, OSError) as err: + except (IOError, OSError): if verbose: to_screen(compat_str(traceback.format_exc())) to_screen(u'ERROR: unable to overwrite current version') return From dca087205692c934163ec9aca5962056f890cd19 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 19:57:52 +0100 Subject: [PATCH 101/425] Move the opener to the YoutubeDL object. This is the first step towards being able to just import youtube_dl and start using it. Apart from removing global state, this would fix problems like #1805. --- youtube_dl/YoutubeDL.py | 86 ++++++++++++++++++++++++++++- youtube_dl/__init__.py | 98 +++------------------------------- youtube_dl/extractor/common.py | 4 +- youtube_dl/utils.py | 4 +- 4 files changed, 96 insertions(+), 96 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a2e3df1f9..72ccfa2ae 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -7,8 +7,10 @@ import errno import io import json import os +import platform import re import shutil +import subprocess import socket import sys import time @@ -18,6 +20,7 @@ if os.name == 'nt': import ctypes from .utils import ( + compat_cookiejar, compat_http_client, compat_print, compat_str, @@ -31,8 +34,10 @@ from .utils import ( encodeFilename, ExtractorError, locked_file, + make_HTTPS_handler, MaxDownloadsReached, PostProcessingError, + platform_name, preferredencoding, SameFileError, sanitize_filename, @@ -41,9 +46,11 @@ from .utils import ( UnavailableVideoError, write_json_file, write_string, + YoutubeDLHandler, ) from .extractor import get_info_extractor, gen_extractors from .FileDownloader import FileDownloader +from .version import __version__ class YoutubeDL(object): @@ -120,6 +127,8 @@ class YoutubeDL(object): downloadarchive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. 
+ cookiefile: File name where cookies should be read from and dumped to. + nocheckcertificate Do not verify SSL certificates The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -160,6 +169,8 @@ class YoutubeDL(object): if '%(stitle)s' in self.params['outtmpl']: self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') + self._setup_opener() + def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) @@ -235,6 +246,9 @@ class YoutubeDL(object): def __exit__(self, *args): self.restore_console_title() + + if self.params.get('cookiefile') is not None: + self.cookiejar.save() def fixed_template(self): """Checks if the output template is fixed.""" @@ -774,7 +788,7 @@ class YoutubeDL(object): for url in url_list: try: #It also downloads the videos - videos = self.extract_info(url) + self.extract_info(url) except UnavailableVideoError: self.report_error(u'unable to download video') except MaxDownloadsReached: @@ -885,3 +899,73 @@ class YoutubeDL(object): '_resolution': u'resolution', 'format_note': u'note'}) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) + + def urlopen(self, req): + """ Start an HTTP download """ + return self._opener.open(req) + + def print_debug_header(self): + if not self.params.get('verbose'): + return + write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') + try: + sp = subprocess.Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_string(u'[debug] Git HEAD: ' + out + u'\n') + except: + try: + sys.exc_clear() + except: + pass + write_string(u'[debug] Python version %s - %s' % + 
(platform.python_version(), platform_name()) + u'\n') + + proxy_map = {} + for handler in self._opener.handlers: + if hasattr(handler, 'proxies'): + proxy_map.update(handler.proxies) + write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') + + def _setup_opener(self, timeout=300): + opts_cookiefile = self.params.get('cookiefile') + opts_proxy = self.params.get('proxy') + + if opts_cookiefile is None: + self.cookiejar = compat_cookiejar.CookieJar() + else: + self.cookiejar = compat_cookiejar.MozillaCookieJar( + opts_cookiefile) + if os.access(opts_cookiefile, os.R_OK): + self.cookiejar.load() + + cookie_processor = compat_urllib_request.HTTPCookieProcessor( + self.cookiejar) + if opts_proxy is not None: + if opts_proxy == '': + proxies = {} + else: + proxies = {'http': opts_proxy, 'https': opts_proxy} + else: + proxies = compat_urllib_request.getproxies() + # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + if 'http' in proxies and 'https' not in proxies: + proxies['https'] = proxies['http'] + proxy_handler = compat_urllib_request.ProxyHandler(proxies) + https_handler = make_HTTPS_handler( + self.params.get('nocheckcertificate', False)) + opener = compat_urllib_request.build_opener( + https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) + # Delete the default user-agent header, which would otherwise apply in + # cases where our custom HTTP handler doesn't come into play + # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + opener.addheaders = [] + self._opener = opener + + # TODO remove this global modification + compat_urllib_request.install_opener(opener) + socket.setdefaulttimeout(timeout) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 64ebf4d48..27886593b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -40,45 +40,35 @@ __authors__ = ( __license__ = 'Public Domain' import codecs -import collections import getpass import optparse import os 
import random import re import shlex -import socket import subprocess import sys -import traceback -import platform from .utils import ( - compat_cookiejar, compat_print, - compat_str, - compat_urllib_request, DateRange, decodeOption, determine_ext, DownloadError, get_cachedir, - make_HTTPS_handler, MaxDownloadsReached, - platform_name, preferredencoding, SameFileError, std_headers, write_string, - YoutubeDLHandler, ) from .update import update_self -from .version import __version__ from .FileDownloader import ( FileDownloader, ) from .extractor import gen_extractors +from .version import __version__ from .YoutubeDL import YoutubeDL from .PostProcessor import ( FFmpegMetadataPP, @@ -451,19 +441,6 @@ def _real_main(argv=None): parser, opts, args = parseOpts(argv) - # Open appropriate CookieJar - if opts.cookiefile is None: - jar = compat_cookiejar.CookieJar() - else: - try: - jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile) - if os.access(opts.cookiefile, os.R_OK): - jar.load() - except (IOError, OSError) as err: - if opts.verbose: - traceback.print_exc() - write_string(u'ERROR: unable to open cookie file\n') - sys.exit(101) # Set user agent if opts.user_agent is not None: std_headers['User-Agent'] = opts.user_agent @@ -495,8 +472,6 @@ def _real_main(argv=None): all_urls = batchurls + args all_urls = [url.strip() for url in all_urls] - opener = _setup_opener(jar=jar, opts=opts) - extractors = gen_extractors() if opts.list_extractors: @@ -551,7 +526,7 @@ def _real_main(argv=None): if opts.retries is not None: try: opts.retries = int(opts.retries) - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid retry count specified') if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) @@ -562,13 +537,13 @@ def _real_main(argv=None): opts.playliststart = int(opts.playliststart) if opts.playliststart <= 0: raise ValueError(u'Playlist start must be positive') - except (TypeError, 
ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist start number specified') try: opts.playlistend = int(opts.playlistend) if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): raise ValueError(u'Playlist end must be greater than playlist start') - except (TypeError, ValueError) as err: + except (TypeError, ValueError): parser.error(u'invalid playlist end number specified') if opts.extractaudio: if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: @@ -671,34 +646,12 @@ def _real_main(argv=None): 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, 'download_archive': opts.download_archive, + 'cookiefile': opts.cookiefile, + 'nocheckcertificate': opts.no_check_certificate, } with YoutubeDL(ydl_opts) as ydl: - if opts.verbose: - write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_string(u'[debug] Git HEAD: ' + out + u'\n') - except: - try: - sys.exc_clear() - except: - pass - write_string(u'[debug] Python version %s - %s' % - (platform.python_version(), platform_name()) + u'\n') - - proxy_map = {} - for handler in opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - + ydl.print_debug_header() ydl.add_default_info_extractors() # PostProcessors @@ -729,46 +682,9 @@ def _real_main(argv=None): ydl.to_screen(u'--max-download limit reached, aborting.') retcode = 101 - # Dump cookie jar if requested - if opts.cookiefile is not None: - try: - jar.save() - except (IOError, OSError): - sys.exit(u'ERROR: unable to save cookie jar') - sys.exit(retcode) 
-def _setup_opener(jar=None, opts=None, timeout=300): - if opts is None: - FakeOptions = collections.namedtuple( - 'FakeOptions', ['proxy', 'no_check_certificate']) - opts = FakeOptions(proxy=None, no_check_certificate=False) - - cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy is not None: - if opts.proxy == '': - proxies = {} - else: - proxies = {'http': opts.proxy, 'https': opts.proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = compat_urllib_request.ProxyHandler(proxies) - https_handler = make_HTTPS_handler(opts) - opener = compat_urllib_request.build_opener( - https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders = [] - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(timeout) - return opener - - def main(argv=None): try: _real_main(argv) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3435c77..423e54cea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -8,7 +8,6 @@ import netrc from ..utils import ( compat_http_client, compat_urllib_error, - compat_urllib_request, compat_str, clean_html, @@ -19,6 +18,7 @@ from ..utils import ( unescapeHTML, ) + class InfoExtractor(object): """Information Extractor class. 
@@ -156,7 +156,7 @@ class InfoExtractor(object): elif note is not False: self.to_screen(u'%s: %s' % (video_id, note)) try: - return compat_urllib_request.urlopen(url_or_request) + return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0720fe9eb..0d2b7bd10 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -535,7 +535,7 @@ def formatSeconds(secs): else: return '%d' % secs -def make_HTTPS_handler(opts): +def make_HTTPS_handler(opts_no_check_certificate): if sys.version_info < (3,2): # Python's 2.x handler is very simplistic return compat_urllib_request.HTTPSHandler() @@ -545,7 +545,7 @@ def make_HTTPS_handler(opts): context.set_default_verify_paths() context.verify_mode = (ssl.CERT_NONE - if opts.no_check_certificate + if opts_no_check_certificate else ssl.CERT_REQUIRED) return compat_urllib_request.HTTPSHandler(context=context) From 00ea0f11eb76e7a67648790524a50f7254b9578f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 20:00:35 +0100 Subject: [PATCH 102/425] Print full title in --get-title output (#1806) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a2e3df1f9..2700051cf 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -639,7 +639,7 @@ class YoutubeDL(object): # Forced printings if self.params.get('forcetitle', False): - compat_print(info_dict['title']) + compat_print(info_dict['fulltitle']) if self.params.get('forceid', False): compat_print(info_dict['id']) if self.params.get('forceurl', False): From 50123be4211e2c16aa5d2fc9ebadbaf72a9becce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 20:23:55 +0100 Subject: [PATCH 103/425] release 
2013.11.22.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c1f581cd6..770b046a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22' +__version__ = '2013.11.22.1' From d35dc6d3b57781e5f1c0a5df308e3c08f66371a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 21:19:31 +0100 Subject: [PATCH 104/425] [bandcamp] move the album test to the album extractor and return a single track instead of a playlist --- youtube_dl/extractor/bandcamp.py | 52 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 359d4174b..1aa9dbefd 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -20,28 +20,6 @@ class BandcampIE(InfoExtractor): u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" }, u'skip': u'There is a limit of 200 free downloads / month for the test song' - }, { - u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', - u'playlist': [ - { - u'file': u'1353101989.mp3', - u'md5': u'39bc1eded3476e927c724321ddf116cf', - u'info_dict': { - u'title': u'Intro', - } - }, - { - u'file': u'38097443.mp3', - u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', - u'info_dict': { - u'title': u'Kero One - Keep It Alive (Blazo remix)', - } - }, - ], - u'params': { - u'playlistend': 2 - }, - u'skip': u'Bancamp imposes download limits. 
See test_playlists:test_bandcamp_album for the playlist test' }] def _real_extract(self, url): @@ -63,13 +41,11 @@ class BandcampIE(InfoExtractor): 'url': format_url, 'ext': format_id.partition('-')[0] } for format_id, format_url in sorted(d['file'].items())] - entries.append({ + return { 'id': compat_str(d['id']), 'title': d['title'], 'formats': formats, - }) - - return self.playlist_result(entries, title, title) + } else: raise ExtractorError(u'No free songs found') @@ -112,6 +88,30 @@ class BandcampAlbumIE(InfoExtractor): IE_NAME = u'Bandcamp:album' _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)' + _TEST = { + u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + u'playlist': [ + { + u'file': u'1353101989.mp3', + u'md5': u'39bc1eded3476e927c724321ddf116cf', + u'info_dict': { + u'title': u'Intro', + } + }, + { + u'file': u'38097443.mp3', + u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', + u'info_dict': { + u'title': u'Kero One - Keep It Alive (Blazo remix)', + } + }, + ], + u'params': { + u'playlistend': 2 + }, + u'skip': u'Bancamp imposes download limits. 
See test_playlists:test_bandcamp_album for the playlist test' + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') From 9f79463803f40a15a6350dc693af75ec215147f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 21:25:12 +0100 Subject: [PATCH 105/425] [howcast] update test's checksum --- youtube_dl/extractor/howcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 46954337f..bafc5826f 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor): _TEST = { u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', u'file': u'390161.mp4', - u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138', + u'md5': u'8b743df908c42f60cf6496586c7f12c3', u'info_dict': { u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. 
Here's the proper way to tie a square knot.", u"title": u"How to Tie a Square Knot Properly" From d3b30148edb6795aadc96b3a464c492b239a2242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 21:26:31 +0100 Subject: [PATCH 106/425] [bambuser:channel] Update test --- test/test_playlists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index d83b3bf51..7c67239a4 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -102,7 +102,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://bambuser.com/channel/pixelversity') self.assertIsPlaylist(result) self.assertEqual(result['title'], u'pixelversity') - self.assertTrue(len(result['entries']) >= 66) + self.assertTrue(len(result['entries']) >= 60) def test_bandcamp_album(self): dl = FakeYDL() From 7012b23c947fc1ed146e314a30d3c70a5fde70e7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 22:46:46 +0100 Subject: [PATCH 107/425] Match --download-archive during playlist processing (Fixes #1745) --- test/test_youtube_lists.py | 6 ++--- youtube_dl/YoutubeDL.py | 43 +++++++++++++++++++++++---------- youtube_dl/extractor/common.py | 4 ++- youtube_dl/extractor/youtube.py | 26 ++++++++++++-------- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 50ad52695..938517a2d 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -84,16 +84,16 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() ie = YoutubeChannelIE(dl) #test paginated channel - result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0] + result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w') self.assertTrue(len(result['entries']) > 90) #test autogenerated channel - result = 
ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0] + result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') self.assertTrue(len(result['entries']) >= 18) def test_youtube_user(self): dl = FakeYDL() ie = YoutubeUserIE(dl) - result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0] + result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation') self.assertTrue(len(result['entries']) >= 320) def test_youtube_safe_search(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2700051cf..beb7d0cd1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -355,15 +355,17 @@ class YoutubeDL(object): def _match_entry(self, info_dict): """ Returns None iff the file should be downloaded """ - title = info_dict['title'] - matchtitle = self.params.get('matchtitle', False) - if matchtitle: - if not re.search(matchtitle, title, re.IGNORECASE): - return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' - rejecttitle = self.params.get('rejecttitle', False) - if rejecttitle: - if re.search(rejecttitle, title, re.IGNORECASE): - return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + if 'title' in info_dict: + # This can happen when we're just evaluating the playlist + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle: + if not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle: + if re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' date = info_dict.get('upload_date', None) if date is not None: dateRange = self.params.get('daterange', DateRange()) @@ -374,8 +376,8 @@ class YoutubeDL(object): if age_limit < info_dict.get('age_limit', 0): return u'Skipping "' + title + '" 
because it is age restricted' if self.in_download_archive(info_dict): - return (u'%(title)s has already been recorded in archive' - % info_dict) + return (u'%s has already been recorded in archive' + % info_dict.get('title', info_dict.get('id', u'video'))) return None @staticmethod @@ -454,7 +456,7 @@ class YoutubeDL(object): ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'playlist': - self.add_extra_info(ie_result, extra_info) + # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -484,6 +486,12 @@ class YoutubeDL(object): 'webpage_url': ie_result['webpage_url'], 'extractor_key': ie_result['extractor_key'], } + + reason = self._match_entry(entry) + if reason is not None: + self.to_screen(u'[download] ' + reason) + continue + entry_result = self.process_ie_result(entry, download=download, extra_info=extra) @@ -810,7 +818,16 @@ class YoutubeDL(object): fn = self.params.get('download_archive') if fn is None: return False - vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + extractor = info_dict.get('extractor_id') + if extractor is None: + if 'id' in info_dict: + extractor = info_dict.get('ie_key') # key in a playlist + if extractor is None: + return False # Incomplete video information + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = extractor.lower() + vid_id = extractor + u' ' + info_dict['id'] try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3435c77..3cebeaf29 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -229,12 +229,14 @@ class InfoExtractor(object): self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None): + def url_result(self, url, ie=None, 
video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} + if video_id is not None: + video_info['id'] = video_id return video_info def playlist_result(self, entries, playlist_id=None, playlist_title=None): """Returns a playlist""" diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9b09793eb..126688652 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1552,7 +1552,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) - return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + return self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) @@ -1571,7 +1571,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(vid, 'Youtube') for vid in ids] + url_results = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids] return self.playlist_result(url_results, playlist_id, playlist_title) @@ -1626,9 +1627,9 @@ class YoutubeChannelIE(InfoExtractor): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] - return [self.playlist_result(url_entries, channel_id)] + url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_entries, channel_id) class YoutubeUserIE(InfoExtractor): @@ -1692,9 +1693,11 @@ class 
YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] - return [self.playlist_result(url_results, playlist_title = username)] + url_results = [ + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_results, playlist_title=username) + class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' @@ -1735,7 +1738,8 @@ class YoutubeSearchIE(SearchInfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] - videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] + videos = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): @@ -1795,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_html = info['feed_html'] m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) + feed_entries.extend( + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids) if info['paging'] is None: break return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) From d7386f6276b7b01ff4254136524d29c8f243721e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 23:05:56 +0100 Subject: [PATCH 108/425] [update] Check if version from repository is newer before updating Closes #1704 --- youtube_dl/update.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index e5f441707..be7800e8b 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -41,6 +41,7 @@ def rsa_verify(message, signature, key): if signature 
!= sha256(message).digest(): return False return True + def update_self(to_screen, verbose): """Update the program file with the latest version from the repository""" @@ -82,6 +83,13 @@ def update_self(to_screen, verbose): return version_id = versions_info['latest'] + + def version_tuple(version_str): + return tuple(map(int, version_str.split('.'))) + if version_tuple(__version__) >= version_tuple(version_str): + to_screen(u'youtube-dl is up to date (%s)' % __version__) + return + to_screen(u'Updating to version ' + version_id + '...') version = versions_info['versions'][version_id] From a87b0615aa311083923e607c3d1a5cdceab818f7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 23:08:15 +0100 Subject: [PATCH 109/425] release 2013.11.22.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 770b046a5..f6d18f945 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22.1' +__version__ = '2013.11.22.2' From 43afe2858870d140b2a133fda2a0cbbd642a3bfc Mon Sep 17 00:00:00 2001 From: Itay Brandes <Brandes.Itay@gmail.com> Date: Sat, 23 Nov 2013 10:22:18 +0200 Subject: [PATCH 110/425] Log to an external logger (fixes #1810) Sadly applications using youtube-dl's python sources can't directly access it's log stream. It's pretty much limited to stdout and stderr only. It should log to logging.Logger instance passed to YoutubeDL's params dictionary. --- youtube_dl/YoutubeDL.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index beb7d0cd1..6729d53ad 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,6 +97,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. 
+ logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file @@ -192,7 +193,9 @@ class YoutubeDL(object): def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - if not self.params.get('quiet', False): + if self.params.get('logger', False): + self.params['logger'].debug(message) + elif not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] output = message + terminator write_string(output, self._screen_file) @@ -200,10 +203,13 @@ class YoutubeDL(object): def to_stderr(self, message): """Print message to stderr.""" assert type(message) == type(u'') - output = message + u'\n' - if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr - output = output.encode(preferredencoding()) - sys.stderr.write(output) + if self.params.get('logger', False): + self.params['logger'].error(message) + else: + output = message + u'\n' + if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr + output = output.encode(preferredencoding()) + sys.stderr.write(output) def to_console_title(self, message): if not self.params.get('consoletitle', False): From 52ad14aeb0178a187a861e7ce2259b7046702281 Mon Sep 17 00:00:00 2001 From: Takuya Tsuchida <takuya0301@gmail.com> Date: Sat, 23 Nov 2013 18:19:44 +0900 Subject: [PATCH 111/425] Add support for niconico --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/niconico.py | 190 +++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 youtube_dl/extractor/niconico.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 02f9e2546..f443f11f6 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -98,6 +98,7 @@ from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE +from .niconico import NiconicoIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py new file mode 100644 index 000000000..8638a8ee8 --- /dev/null +++ b/youtube_dl/extractor/niconico.py @@ -0,0 +1,190 @@ +# encoding: utf-8 + +import re +import socket +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, + compat_str, + + ExtractorError, + unified_strdate, +) + +class NiconicoIE(InfoExtractor): + IE_NAME = u'niconico' + IE_DESC = u'ニコニコ動画' + + _TEST = { + u'url': u'http://www.nicovideo.jp/watch/sm22312215', + u'file': u'sm22312215.mp4', + u'md5': u'd1a75c0823e2f629128c43e1212760f9', + u'info_dict': { + u'title': u'Big Buck Bunny', + u'uploader': u'takuya0301', + u'uploader_id': u'2698420', + u'upload_date': u'20131123', + u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + }, + u'params': { + u'username': u'ydl.niconico@gmail.com', + u'password': u'youtube-dl', + }, + } + + _VALID_URL = r'^(?:https?://)?(?:www\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' + _LOGIN_URL = 'https://secure.nicovideo.jp/secure/login' + _NETRC_MACHINE = 'niconico' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = True + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) + return False + + # Log in + login_form_strs = { + u'mail': username, + u'password': password, + } + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') + request = compat_urllib_request.Request(self._LOGIN_URL, login_data) + try: + self.report_login() + login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return False + return True + + def _real_extract(self, url): + video_id = self._extract_id(url) + + # Get video webpage + self.report_video_webpage_download(video_id) + url = 'http://www.nicovideo.jp/watch/' + video_id + request = compat_urllib_request.Request(url) + try: + video_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) + + # Get video info + self.report_video_info_webpage_download(video_id) + url = 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id + request = compat_urllib_request.Request(url) + try: + video_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download video info webpage: %s' % compat_str(err)) + + # Get flv info + self.report_flv_info_webpage_download(video_id) + url = 
'http://flapi.nicovideo.jp/api/getflv?v=' + video_id + request = compat_urllib_request.Request(url) + try: + flv_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download flv info webpage: %s' % compat_str(err)) + + # Start extracting information + self.report_information_extraction(video_id) + video_info = xml.etree.ElementTree.fromstring(video_info_webpage) + + # url + video_real_url = compat_urlparse.parse_qs(flv_info_webpage.decode('utf-8'))['url'][0] + + # title + video_title = video_info.find('.//title').text + + # ext + video_extension = video_info.find('.//movie_type').text + + # format + video_format = video_extension.upper() + + # thumbnail + video_thumbnail = video_info.find('.//thumbnail_url').text + + # description + video_description = video_info.find('.//description').text + + # uploader_id + video_uploader_id = video_info.find('.//user_id').text + + # uploader + url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id + request = compat_urllib_request.Request(url) + try: + user_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) + + user_info = xml.etree.ElementTree.fromstring(user_info_webpage) + video_uploader = user_info.find('.//nickname').text + + # uploder_date + video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + + # view_count + video_view_count = video_info.find('.//view_counter').text + + # webpage_url + video_webpage_url = video_info.find('.//watch_url').text + + return { + 'id': video_id, + 'url': video_real_url, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, + 'description': 
video_description, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'view_count': video_view_count, + 'webpage_url': video_webpage_url, + } + + def _extract_id(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + return video_id + + def report_video_webpage_download(self, video_id): + """Report attempt to download video webpage.""" + self.to_screen(u'%s: Downloading video webpage' % video_id) + + def report_video_info_webpage_download(self, video_id): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Downloading video info webpage' % video_id) + + def report_flv_info_webpage_download(self, video_id): + """Report attempt to download flv info webpage.""" + self.to_screen(u'%s: Downloading flv info webpage' % video_id) + + def report_information_extraction(self, video_id): + """Report attempt to extract video information.""" + self.to_screen(u'%s: Extracting video information' % video_id) From e5c146d58682dbdda2b46fc71a16ddc5b1fcc9fc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 23 Nov 2013 15:57:42 +0100 Subject: [PATCH 112/425] [streamcloud] skip test on travis --- youtube_dl/extractor/streamcloud.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index d476693ec..9faf3a5e3 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -21,6 +21,7 @@ class StreamcloudIE(InfoExtractor): u'title': u'youtube-dl test video \'/\\ ä ↭', u'duration': 9, }, + u'skip': u'Only available from the EU' } def _real_extract(self, url): From 36de0a0e1a49f6324616f9b508920ff7d06136c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 23 Nov 2013 23:26:06 +0100 Subject: [PATCH 113/425] 
[brightcove] Set the 'videoPlayer' value to the 'videoId' if it's missing in the parameters (fixes #1815) --- youtube_dl/extractor/brightcove.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d8c35465a..74a7d13e3 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -75,14 +75,17 @@ class BrightcoveIE(InfoExtractor): params = {'flashID': object_doc.attrib['id'], 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } - playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey') + def find_param(name): + return find_xpath_attr(object_doc, './param', 'name', name) + playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey.attrib['value'] - videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer') + # The three fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer.attrib['value'] - linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') + linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase.attrib['value'] data = compat_urllib_parse.urlencode(params) From 8bf9319e9c0c02f5516b00509476abff89eb3d41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:08:11 +0100 Subject: [PATCH 114/425] Simplify logger code(#1811) --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6729d53ad..d7e2417ac 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,7 +97,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. 
matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. - logger: Log messages to a logging.Logger instance. + logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file @@ -193,7 +193,7 @@ class YoutubeDL(object): def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - if self.params.get('logger', False): + if self.params.get('logger'): self.params['logger'].debug(message) elif not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] @@ -203,7 +203,7 @@ class YoutubeDL(object): def to_stderr(self, message): """Print message to stderr.""" assert type(message) == type(u'') - if self.params.get('logger', False): + if self.params.get('logger'): self.params['logger'].error(message) else: output = message + u'\n' From 13ebea791fb4293acf939730ad5a9c07e553005f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:37:14 +0100 Subject: [PATCH 115/425] [niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 
--- youtube_dl/extractor/niconico.py | 121 ++++++++----------------------- youtube_dl/utils.py | 29 ++++++-- 2 files changed, 55 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 8638a8ee8..22898b5a1 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -17,6 +17,7 @@ from ..utils import ( unified_strdate, ) + class NiconicoIE(InfoExtractor): IE_NAME = u'niconico' IE_DESC = u'ニコニコ動画' @@ -38,8 +39,7 @@ class NiconicoIE(InfoExtractor): }, } - _VALID_URL = r'^(?:https?://)?(?:www\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' - _LOGIN_URL = 'https://secure.nicovideo.jp/secure/login' + _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' _NETRC_MACHINE = 'niconico' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = True @@ -57,99 +57,63 @@ class NiconicoIE(InfoExtractor): # Log in login_form_strs = { - u'mail': username, - u'password': password, + u'mail': username, + u'password': password, } # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode # chokes on unicode login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') - request = compat_urllib_request.Request(self._LOGIN_URL, login_data) - try: - self.report_login() - login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return False - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + request = compat_urllib_request.Request( + 
u'https://secure.nicovideo.jp/secure/login', login_data) + login_results = self._download_webpage( + request, u'', note=u'Logging in', errnote=u'Unable to log in') + if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') return False return True def _real_extract(self, url): - video_id = self._extract_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) # Get video webpage - self.report_video_webpage_download(video_id) - url = 'http://www.nicovideo.jp/watch/' + video_id - request = compat_urllib_request.Request(url) - try: - video_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) + video_webpage = self._download_webpage( + 'http://www.nicovideo.jp/watch/' + video_id, video_id) - # Get video info - self.report_video_info_webpage_download(video_id) - url = 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id - request = compat_urllib_request.Request(url) - try: - video_info_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info webpage: %s' % compat_str(err)) + video_info_webpage = self._download_webpage( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, + note=u'Downloading video info page') # Get flv info - self.report_flv_info_webpage_download(video_id) - url = 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id - request = compat_urllib_request.Request(url) - try: - flv_info_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download flv info 
webpage: %s' % compat_str(err)) + flv_info_webpage = self._download_webpage( + u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, u'Downloading flv info') + video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - self.report_information_extraction(video_id) video_info = xml.etree.ElementTree.fromstring(video_info_webpage) - - # url - video_real_url = compat_urlparse.parse_qs(flv_info_webpage.decode('utf-8'))['url'][0] - - # title video_title = video_info.find('.//title').text - - # ext video_extension = video_info.find('.//movie_type').text - - # format video_format = video_extension.upper() - - # thumbnail video_thumbnail = video_info.find('.//thumbnail_url').text - - # description video_description = video_info.find('.//description').text - - # uploader_id video_uploader_id = video_info.find('.//user_id').text + video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + video_view_count = video_info.find('.//view_counter').text + video_webpage_url = video_info.find('.//watch_url').text # uploader + video_uploader = video_uploader_id url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id - request = compat_urllib_request.Request(url) try: - user_info_webpage = compat_urllib_request.urlopen(request).read() + user_info_webpage = self._download_webpage( + url, video_id, note=u'Downloading user information') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) - - user_info = xml.etree.ElementTree.fromstring(user_info_webpage) - video_uploader = user_info.find('.//nickname').text - - # uploder_date - video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) - - # view_count - video_view_count = video_info.find('.//view_counter').text - - # webpage_url - video_webpage_url = 
video_info.find('.//watch_url').text + else: + user_info = xml.etree.ElementTree.fromstring(user_info_webpage) + video_uploader = user_info.find('.//nickname').text return { 'id': video_id, @@ -165,26 +129,3 @@ class NiconicoIE(InfoExtractor): 'view_count': video_view_count, 'webpage_url': video_webpage_url, } - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) - return video_id - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) - - def report_flv_info_webpage_download(self, video_id): - """Report attempt to download flv info webpage.""" - self.to_screen(u'%s: Downloading flv info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0720fe9eb..34b3d19e0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,6 +12,7 @@ import os import pipes import platform import re +import ssl import socket import sys import traceback @@ -535,13 +536,31 @@ def formatSeconds(secs): else: return '%d' % secs + def make_HTTPS_handler(opts): - if sys.version_info < (3,2): - # Python's 2.x handler is very simplistic - return compat_urllib_request.HTTPSHandler() + if sys.version_info < (3, 2): + import httplib + + class HTTPSConnectionV3(httplib.HTTPSConnection): + def __init__(self, *args, **kwargs): + httplib.HTTPSConnection.__init__(self, *args, **kwargs) + + def connect(self): + sock = socket.create_connection((self.host, self.port), self.timeout) + if 
self._tunnel_host: + self.sock = sock + self._tunnel() + try: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) + except ssl.SSLError as e: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) + + class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): + def https_open(self, req): + return self.do_open(HTTPSConnectionV3, req) + return HTTPSHandlerV3() else: - import ssl - context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) context.set_default_verify_paths() context.verify_mode = (ssl.CERT_NONE From 38b2db6a666e094896927217aa293750a732e81d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:39:49 +0100 Subject: [PATCH 116/425] Credit @takuya0301 for niconico --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 64ebf4d48..19904dbfd 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -35,6 +35,7 @@ __authors__ = ( 'Jelle van der Waa', 'Marcin Cieślak', 'Anton Larionov', + 'Takuya Tsuchida', ) __license__ = 'Public Domain' From 2e767313e49b43400b3baae247e0f4c9e9e24992 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:52:21 +0100 Subject: [PATCH 117/425] [update] fix error --- youtube_dl/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index be7800e8b..cd9670166 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -86,7 +86,7 @@ def update_self(to_screen, verbose): def version_tuple(version_str): return tuple(map(int, version_str.split('.'))) - if version_tuple(__version__) >= version_tuple(version_str): + if version_tuple(__version__) >= version_tuple(version_id): to_screen(u'youtube-dl is up to date (%s)' % __version__) return From 
23e6d50d73188eab26944e41f164a5a1ab7f547a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:52:53 +0100 Subject: [PATCH 118/425] [bandcamp] Remove unused variable --- youtube_dl/extractor/bandcamp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 1aa9dbefd..3a32c14c5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,7 +34,6 @@ class BandcampIE(InfoExtractor): json_code = m_trackinfo.group(1) data = json.loads(json_code) - entries = [] for d in data: formats = [{ 'format_id': 'format_id', From bd49928f7a0254eeb8d5f918c5649ce4eb78ef36 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:53:50 +0100 Subject: [PATCH 119/425] [niconico] Clarify download --- youtube_dl/extractor/niconico.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 22898b5a1..729607ea3 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -77,9 +77,9 @@ class NiconicoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - # Get video webpage - video_webpage = self._download_webpage( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) + # Get video webpage. 
We are not actually interested in it, but need + # the cookies in order to be able to download the info webpage + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) video_info_webpage = self._download_webpage( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, From 66ec0192406bbf1bffcb6c4e72fe1529f1e21195 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:54:26 +0100 Subject: [PATCH 120/425] [youtube] do not use variable name twice --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 126688652..07a457f4d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1571,8 +1571,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids] + url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] return self.playlist_result(url_results, playlist_id, playlist_title) From 382ed50e0ecfb2fa692049030c858b99159c791b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:30:05 +0100 Subject: [PATCH 121/425] [viki] Add extractor (fixes #1813) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/viki.py | 91 ++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/viki.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f443f11f6..867734fa2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -157,6 +157,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .viki import VikiIE from .vk import VKIE from .wat 
import WatIE from .websurg import WeBSurgIE diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py new file mode 100644 index 000000000..78d03c079 --- /dev/null +++ b/youtube_dl/extractor/viki.py @@ -0,0 +1,91 @@ +import re + +from ..utils import ( + unified_strdate, +) +from .subtitles import SubtitlesInfoExtractor + + +class VikiIE(SubtitlesInfoExtractor): + IE_NAME = u'viki' + + _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + _TEST = { + u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', + u'file': u'1023585v.mp4', + u'md5': u'a21454021c2646f5433514177e2caa5f', + u'info_dict': { + u'title': u'Heirs Episode 14', + u'uploader': u'SBS', + u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', + u'upload_date': u'20131121', + u'age_limit': 13, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + uploader = self._html_search_regex( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, + u'uploader') + if uploader is not None: + uploader = uploader.strip() + + rating_str = self._html_search_regex( + r'<strong>Rating: </strong>\s*([^<]*)<', webpage, + u'rating information', default='').strip() + RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, + } + age_limit = RATINGS.get(rating_str) + + info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id + info_webpage = self._download_webpage(info_url, video_id) + video_url = self._html_search_regex( + r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') + + upload_date_str = self._html_search_regex( + r'"created_at":"([^"]+)"', info_webpage, u'upload date') + upload_date = ( + unified_strdate(upload_date_str) + if upload_date_str is not None + else 
None + ) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, info_webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, info_webpage) + return + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'description': description, + 'thumbnail': thumbnail, + 'age_limit': age_limit, + 'uploader': uploader, + 'subtitles': video_subtitles, + 'upload_date': upload_date, + } + + def _get_available_subtitles(self, video_id, info_webpage): + res = {} + for sturl in re.findall(r'<track src="([^"]+)"/>'): + m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) + if not m: + continue + res[m.group('lang')] = sturl + return res From eaaafc59c2f8ffaee4df06092a57f65eec1b6eaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:30:34 +0100 Subject: [PATCH 122/425] release 2013.11.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f6d18f945..68ef46a30 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22.2' +__version__ = '2013.11.24' From 0c7c19d6bc55a624532f2426d080aea51962cfe0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:51:44 +0100 Subject: [PATCH 123/425] [clipfish] Add extractor (Fixes #1760) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clipfish.py | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 youtube_dl/extractor/clipfish.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 867734fa2..4c280fa5e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE 
from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py new file mode 100644 index 000000000..95449da3c --- /dev/null +++ b/youtube_dl/extractor/clipfish.py @@ -0,0 +1,53 @@ +import re +import time +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class ClipfishIE(InfoExtractor): + IE_NAME = u'clipfish' + + _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _TEST = { + u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', + u'file': u'4028320.f4v', + u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'info_dict': { + u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', + u'duration': 399, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % + (video_id, int(time.time()))) + info_xml = self._download_webpage( + info_url, video_id, note=u'Downloading info page') + doc = xml.etree.ElementTree.fromstring(info_xml) + title = doc.find('title').text + video_url = doc.find('filename').text + thumbnail = doc.find('imageurl').text + duration_str = doc.find('duration').text + m = re.match( + r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$', + duration_str) + if m: + duration = ( + (int(m.group('hours')) * 60 * 60) + + (int(m.group('minutes')) * 60) + + (int(m.group('seconds'))) + ) + else: + duration = None + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'duration': duration, + } From 138df537ffaeda182789440c4086f009a739dde3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:51:56 +0100 Subject: [PATCH 124/425] release 2013.11.24.1 --- youtube_dl/version.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 68ef46a30..de92411bb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.24' +__version__ = '2013.11.24.1' From d214fdb8fe796e92485e28038ee72d28caa3ad10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 11:02:34 +0100 Subject: [PATCH 125/425] [brightcove] Don't use 'or' with the xml nodes, use the 'value' attribute instead --- youtube_dl/extractor/brightcove.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 74a7d13e3..66fe0ac9a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -76,18 +76,21 @@ class BrightcoveIE(InfoExtractor): 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } def find_param(name): - return find_xpath_attr(object_doc, './param', 'name', name) + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return None playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: - params['playerKey'] = playerKey.attrib['value'] + params['playerKey'] = playerKey # The three fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: - params['@videoPlayer'] = videoPlayer.attrib['value'] + params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: - params['linkBaseURL'] = linkBase.attrib['value'] + params['linkBaseURL'] = linkBase data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data From dc65dcbb6d709ef6e38f336fe77c14767d6c8f9d Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 11:28:44 +0100 Subject: [PATCH 126/425] [mixcloud] The description field may be missing (fixes #1819) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a200dcd74..e2baf44d7 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor): 'title': info['name'], 'url': final_song_url, 'ext': 'mp3', - 'description': info['description'], + 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], 'uploader_id': info['user']['username'], From f459d17018812dc896324f8208cdfe2ada04ea50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 14:33:50 +0100 Subject: [PATCH 127/425] [youtube] Add an extractor for downloading the watch history (closes #1821) --- test/test_all_urls.py | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56e5f80e1..42813da1a 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -100,6 +100,7 @@ class TestAllURLsMatching(unittest.TestCase): def test_keywords(self): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) + self.assertMatch(':ythistory', ['youtube:history']) self.assertMatch(':thedailyshow', ['ComedyCentral']) self.assertMatch(':tds', ['ComedyCentral']) self.assertMatch(':colbertreport', ['ComedyCentral']) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4c280fa5e..1fbd10bc5 100644 --- a/youtube_dl/extractor/__init__.py 
+++ b/youtube_dl/extractor/__init__.py @@ -186,6 +186,7 @@ from .youtube import ( YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, + YoutubeHistoryIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 07a457f4d..64d4c2445 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1826,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _PAGING_STEP = 100 _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PERSONAL_FEED = True + _PLAYLIST_TITLE = u'Youtube Watch History' + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') + data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') + # The step is actually a ridiculously big number (like 1374343569725646) + self._PAGING_STEP = int(data_paging) + return super(YoutubeHistoryIE, self)._real_extract(url) + class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' From 267ed0c5d3547c68f1d34203c2ae4b0d826a29d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 14:59:19 +0100 Subject: [PATCH 128/425] [collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes #1822) Uses a new helper method in InfoExtractor: _download_xml --- youtube_dl/extractor/collegehumor.py | 7 ++----- youtube_dl/extractor/common.py | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 0c29acfb1..b27c1dfc5 100644 --- 
a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - metaXml = self._download_webpage(xmlUrl, video_id, + mdoc = self._download_xml(xmlUrl, video_id, u'Downloading info XML', u'Unable to download video info XML') - mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] youtubeIdNode = videoNode.find('./youtubeID') @@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor): if next_url.endswith(u'manifest.f4m'): manifest_url = next_url + '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, + adoc = self._download_xml(manifest_url, video_id, u'Downloading XML manifest', u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) try: video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cebeaf29..482a231ec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,6 +4,7 @@ import re import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, @@ -208,6 +209,11 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + """Return the xml as an xml.etree.ElementTree.Element""" + xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def to_screen(self, msg): """Print msg to screen, 
prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) From a1ee09e815cb413d67cee17ad686224b26182dfb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 15:03:25 +0100 Subject: [PATCH 129/425] Document proxy --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 72ccfa2ae..0a845a344 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -128,7 +128,8 @@ class YoutubeDL(object): Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. - nocheckcertificate Do not verify SSL certificates + nocheckcertificate:Do not verify SSL certificates + proxy: URL of the proxy server to use The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: From b7553b25543175c27c885b0c6ab77d91b270a520 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 15:20:16 +0100 Subject: [PATCH 130/425] [vik] Clarify output --- youtube_dl/extractor/viki.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 78d03c079..8088dcf0b 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -51,7 +51,8 @@ class VikiIE(SubtitlesInfoExtractor): age_limit = RATINGS.get(rating_str) info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id - info_webpage = self._download_webpage(info_url, video_id) + info_webpage = self._download_webpage( + info_url, video_id, note=u'Downloading info page') video_url = self._html_search_regex( r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') From 6d88bc37a32d5d624c09d68cd19e64e6095fa5de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 15:28:33 +0100 
Subject: [PATCH 131/425] [viki] Skip travis test Also provide a better error message for geoblocked videos. --- youtube_dl/extractor/viki.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 8088dcf0b..7b3a58de8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,6 +1,7 @@ import re from ..utils import ( + ExtractorError, unified_strdate, ) from .subtitles import SubtitlesInfoExtractor @@ -20,7 +21,8 @@ class VikiIE(SubtitlesInfoExtractor): u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', u'upload_date': u'20131121', u'age_limit': 13, - } + }, + u'skip': u'Blocked in the US', } def _real_extract(self, url): @@ -53,6 +55,10 @@ class VikiIE(SubtitlesInfoExtractor): info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id info_webpage = self._download_webpage( info_url, video_id, note=u'Downloading info page') + if re.match(r'\s*<div\s+class="video-error', info_webpage): + raise ExtractorError( + u'Video %s is blocked from your location.' 
% video_id, + expected=True) video_url = self._html_search_regex( r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') From 66cfab4226296c1596fbf37c27758bbdb6846d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 21:18:35 +0100 Subject: [PATCH 132/425] [comedycentral] Add support for comedycentral.com videos (closes #1824) It's a subclass of MTVIE The extractor for colbertnation.com and thedailyshow.com is called now ComedyCentralShowsIE --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/comedycentral.py | 33 ++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1fbd10bc5..0b4d086b7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,7 @@ from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE -from .comedycentral import ComedyCentralIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 69b2beece..725849d2e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,6 +2,7 @@ import re import xml.etree.ElementTree from .common import InfoExtractor +from .mtv import MTVIE, _media_xml_tag from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +12,37 @@ from ..utils import ( ) -class ComedyCentralIE(InfoExtractor): +class ComedyCentralIE(MTVIE): + _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + + _TEST = { + u'url': 
u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + u'md5': u'4167875aae411f903b751a21f357f1ee', + u'info_dict': { + u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', + u'ext': u'mp4', + u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', + u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + }, + } + # Overwrite MTVIE properties we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) + + +class ComedyCentralShowsIE(InfoExtractor): IE_DESC = u'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: From 16e055849ebfa5a942aef4411728b36bf53ebaa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 22:13:20 +0100 Subject: [PATCH 133/425] Update the keywords tests for the rename of the old ComedyCentralIE --- test/test_all_urls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 42813da1a..1f1adb6b4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,10 +101,10 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentral']) - self.assertMatch(':tds', ['ComedyCentral']) - 
self.assertMatch(':colbertreport', ['ComedyCentral']) - self.assertMatch(':cr', ['ComedyCentral']) + self.assertMatch(':thedailyshow', ['ComedyCentralShows']) + self.assertMatch(':tds', ['ComedyCentralShows']) + self.assertMatch(':colbertreport', ['ComedyCentralShows']) + self.assertMatch(':cr', ['ComedyCentralShows']) if __name__ == '__main__': From 1fb2bcbbf748e07d05f98110cc27d440506a9b77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 02:02:34 +0100 Subject: [PATCH 134/425] [viki] Make uploader field optional (#1813) --- youtube_dl/extractor/viki.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 7b3a58de8..cd986a749 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -34,11 +34,12 @@ class VikiIE(SubtitlesInfoExtractor): description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, - u'uploader') - if uploader is not None: - uploader = uploader.strip() + uploader_m = re.search( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) + if uploader_m is None: + uploader = None + else: + uploader = uploader.group(1).strip() rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, From 02dbf93f0e98a56ed04b4a9e6a6d62efd6d801f9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:12:26 +0100 Subject: [PATCH 135/425] [zdf/common] Use API in ZDF extractor. 
This also comes with a lot of extra format fields Fixes #1518 --- youtube_dl/FileDownloader.py | 20 +----- youtube_dl/YoutubeDL.py | 24 ++++--- youtube_dl/extractor/common.py | 2 + youtube_dl/extractor/zdf.py | 115 ++++++++++++++++++++------------- youtube_dl/utils.py | 21 ++++++ 5 files changed, 112 insertions(+), 70 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index e5a542ed5..2b4fb0b31 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -1,4 +1,3 @@ -import math import os import re import subprocess @@ -11,6 +10,7 @@ from .utils import ( ContentTooShortError, determine_ext, encodeFilename, + format_bytes, sanitize_open, timeconvert, ) @@ -53,20 +53,6 @@ class FileDownloader(object): self._progress_hooks = [] self.params = params - @staticmethod - def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - @staticmethod def format_seconds(seconds): (mins, secs) = divmod(seconds, 60) @@ -117,7 +103,7 @@ class FileDownloader(object): def format_speed(speed): if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) + return '%10s' % ('%s/s' % format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -525,7 +511,7 @@ class FileDownloader(object): self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) return False - data_len_str = self.format_bytes(data_len) + data_len_str = format_bytes(data_len) byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) start = time.time() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d7e2417ac..0578fe6c1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -30,6 +30,7 @@ from .utils import ( DownloadError, encodeFilename, ExtractorError, + format_bytes, locked_file, MaxDownloadsReached, PostProcessingError, @@ -867,9 +868,11 @@ class YoutubeDL(object): def list_formats(self, info_dict): def format_note(fdict): - if fdict.get('format_note') is not None: - return fdict['format_note'] res = u'' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + u' ' + if fdict.get('quality_name') is not None: + res += u'%s ' % fdict['quality_name'] if fdict.get('vcodec') is not None: res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: @@ -886,25 +889,30 @@ class YoutubeDL(object): res += 'audio' if fdict.get('abr') is not None: res += u'@%3dk' % fdict['abr'] + if fdict.get('filesize') is not None: + if res: + res += u', ' + res += format_bytes(fdict['filesize']) return res - def line(format): - return (u'%-20s%-10s%-12s%s' % ( + def line(format, idlen=20): + return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), format_note(format), - ) - ) + )) formats = info_dict.get('formats', [info_dict]) - formats_s = list(map(line, formats)) + idlen = max(len(u'format code'), + max(len(f['format_id']) for f in formats)) + formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', - '_resolution': u'resolution', 'format_note': u'note'}) + 
'_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 482a231ec..3c4781121 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -76,6 +76,8 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * quality_name Human-readable name of the video quality. + * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c6a9d06f2..a8d899883 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,75 +1,100 @@ +import operator import re from .common import InfoExtractor from ..utils import ( - determine_ext, - ExtractorError, + parse_xml_doc, + unified_strdate, ) class ZDFIE(InfoExtractor): _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' 
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') - if mobj.group('hash'): - url = url.replace(u'#', u'', 1) + xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + info_xml = self._download_webpage( + xml_url, video_id, note=u'Downloading video info') + doc = parse_xml_doc(info_xml) - html = self._download_webpage(url, video_id) - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if streams is None: - raise ExtractorError(u'No media url found.') + title = doc.find('.//information/title').text + description = doc.find('.//information/detail').text + uploader_node = doc.find('.//details/originChannelTitle') + uploader = None if uploader_node is None else uploader_node.text + duration_str = doc.find('.//details/length').text + duration_m = re.match(r'''(?x)^ + (?P<hours>[0-9]{2}) + :(?P<minutes>[0-9]{2}) + :(?P<seconds>[0-9]{2}) + (?:\.(?P<ms>[0-9]+)?) 
+ ''', duration_str) + duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m + else None + ) + upload_date = unified_strdate(doc.find('.//details/airtime').text) - # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url - # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url - # choose first/default media type and highest quality for now - def stream_pref(s): - TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + def xml_to_format(fnode): + video_url = fnode.find('url').text + is_available = u'http://www.metafilegenerator' not in video_url + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + PROTO_ORDER = ['http', 'rtmp', 'rtsp'] try: - type_pref = TYPE_ORDER.index(s['media_type']) + proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - type_pref = 999 + proto_pref = 999 - QUALITY_ORDER = ['veryhigh', '300'] + quality = fnode.find('./quality').text + QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: - quality_pref = QUALITY_ORDER.index(s['quality']) + quality_pref = -QUALITY_ORDER.index(quality) except ValueError: quality_pref = 999 - return (type_pref, quality_pref) + abr = int(fnode.find('./audioBitrate').text) // 1000 + vbr = int(fnode.find('./videoBitrate').text) // 1000 + pref = (is_available, proto_pref, quality_pref, vbr, abr) - sorted_streams = sorted(streams, key=stream_pref) - if not sorted_streams: - raise ExtractorError(u'No stream found.') - stream = sorted_streams[0] + return { + 'format_id': format_id, + 'url': video_url, + 'ext': format_m.group('container'), + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': int(fnode.find('./width').text), + 
'height': int(fnode.find('./height').text), + 'quality_name': quality, + 'filesize': int(fnode.find('./filesize').text), + 'format_note': None if is_available else u'(unavailable)', + '_pref': pref, + } - media_link = self._download_webpage( - stream['video_url'], - video_id, - u'Get stream URL') - - #MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' - - mobj = re.search(self._MEDIA_STREAM, media_link) - if mobj is None: - mobj = re.search(RTSP_STREAM, media_link) - if mobj is None: - raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - video_url = mobj.group('video_url') - - title = self._html_search_regex( - r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', - html, u'title') + format_nodes = doc.findall('.//formitaeten/formitaet') + formats = sorted(map(xml_to_format, format_nodes), + key=operator.itemgetter('_pref')) return { 'id': video_id, - 'url': video_url, 'title': title, - 'ext': determine_ext(video_url) + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'duration': duration, + 'upload_date': upload_date, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 34b3d19e0..ad0a06287 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -8,6 +8,7 @@ import gzip import io import json import locale +import math import os import pipes import platform @@ -16,6 +17,7 @@ import ssl import socket import sys import traceback +import xml.etree.ElementTree import zlib try: @@ -1006,3 +1008,22 @@ def unsmuggle_url(smug_url): jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] data = json.loads(jsond) return url, data + + +def parse_xml_doc(s): + assert isinstance(s, type(u'')) + return xml.etree.ElementTree.fromstring(s.encode('utf-8')) + + +def format_bytes(bytes): + if bytes is None: + return u'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = [u'B', u'KiB', u'MiB', 
u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return u'%.2f%s' % (converted, suffix) From c059bdd432911cff8c7426380a876c9679855ab5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:28:55 +0100 Subject: [PATCH 136/425] Remove quality_name field and improve zdf extractor --- youtube_dl/YoutubeDL.py | 2 -- youtube_dl/extractor/common.py | 1 - youtube_dl/extractor/zdf.py | 23 +++++++++++++++++------ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0578fe6c1..a896d9e63 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -871,8 +871,6 @@ class YoutubeDL(object): res = u'' if fdict.get('format_note') is not None: res += fdict['format_note'] + u' ' - if fdict.get('quality_name') is not None: - res += u'%s ' % fdict['quality_name'] if fdict.get('vcodec') is not None: res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3c4781121..3d8ac8ba2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -76,7 +76,6 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use - * quality_name Human-readable name of the video quality. * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. 
(It will be set diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a8d899883..07f830e80 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -52,6 +52,9 @@ class ZDFIE(InfoExtractor): (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) ''', format_id) + ext = format_m.group('container') + is_supported = ext != 'f4f' + PROTO_ORDER = ['http', 'rtmp', 'rtsp'] try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) @@ -67,26 +70,34 @@ class ZDFIE(InfoExtractor): abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 - pref = (is_available, proto_pref, quality_pref, vbr, abr) + pref = (is_available, is_supported, + proto_pref, quality_pref, vbr, abr) + + format_note = u'' + if not is_supported: + format_note += u'(unsupported)' + if not format_note: + format_note = None return { - 'format_id': format_id, + 'format_id': format_id + u'-' + quality, 'url': video_url, - 'ext': format_m.group('container'), + 'ext': ext, 'acodec': format_m.group('acodec'), 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, 'width': int(fnode.find('./width').text), 'height': int(fnode.find('./height').text), - 'quality_name': quality, 'filesize': int(fnode.find('./filesize').text), - 'format_note': None if is_available else u'(unavailable)', + 'format_note': format_note, '_pref': pref, + '_available': is_available, } format_nodes = doc.findall('.//formitaeten/formitaet') - formats = sorted(map(xml_to_format, format_nodes), + formats = sorted(filter(lambda f: f['_available'], + map(xml_to_format, format_nodes)), key=operator.itemgetter('_pref')) return { From 113577e155b10d6775f38e00b897f8e1d743a17e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:35:52 +0100 Subject: [PATCH 137/425] [generic] Improve detection Allow download of http://goo.gl/7X5tOk Fixes #1818 --- youtube_dl/extractor/generic.py | 8 +++----- 1 file changed, 3 
insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0b5f2b2bb..37671430a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -209,7 +209,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) if mobj is None: # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage) + mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage) if mobj is None: # Try to find twitter cards info mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) @@ -236,18 +236,16 @@ class GenericIE(InfoExtractor): video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) # here's a fun little line of code for you: - video_extension = os.path.splitext(video_id)[1][1:] video_id = os.path.splitext(video_id)[0] # video uploader is domain name video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', url, u'video uploader') - return [{ + return { 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': None, 'title': video_title, - 'ext': video_extension, - }] + } From ac05067d3dbc68cd50e8e07d51700b5a8a698a29 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:37:49 +0100 Subject: [PATCH 138/425] release 2013.11.25 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index de92411bb..91b36e55c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.24.1' +__version__ = '2013.11.25' From d0efb9ec9a85662fa43f026339821513ac2f039c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 03:47:32 +0100 Subject: [PATCH 139/425] [tests] Remove global_setup function --- 
test/helper.py | 4 ---- test/test_age_restriction.py | 3 +-- test/test_download.py | 2 -- test/test_playlists.py | 3 +-- test/test_subtitles.py | 3 +-- test/test_write_annotations.py | 3 +-- test/test_write_info_json.py | 3 +-- test/test_youtube_lists.py | 3 +-- test/test_youtube_signature.py | 3 --- 9 files changed, 6 insertions(+), 21 deletions(-) diff --git a/test/helper.py b/test/helper.py index d7bf7a828..b1f421ac5 100644 --- a/test/helper.py +++ b/test/helper.py @@ -12,10 +12,6 @@ from youtube_dl import YoutubeDL from youtube_dl.utils import preferredencoding -def global_setup(): - youtube_dl._setup_opener(timeout=10) - - def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 506572e9e..c9cdb96cb 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import global_setup, try_rm -global_setup() +from test.helper import try_rm from youtube_dl import YoutubeDL diff --git a/test/test_download.py b/test/test_download.py index fe7f7b8cb..dd5818dba 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -9,12 +9,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( get_params, get_testcases, - global_setup, try_rm, md5, report_warning ) -global_setup() import hashlib diff --git a/test/test_playlists.py b/test/test_playlists.py index 7c67239a4..167801ae2 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -8,8 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( diff --git 
a/test/test_subtitles.py b/test/test_subtitles.py index 06a304879..94a1f771d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup, md5 -global_setup() +from test.helper import FakeYDL, md5 from youtube_dl.extractor import ( diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 35defb895..eac53b285 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup, try_rm -global_setup() +from test.helper import get_params, try_rm import io diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index 30c4859fd..d7177611b 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -7,8 +7,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, global_setup -global_setup() +from test.helper import get_params import io diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 938517a2d..8fd073f31 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -6,8 +6,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, global_setup -global_setup() +from test.helper import FakeYDL from youtube_dl.extractor import ( diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5e1ff5eb0..056700614 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -6,9 +6,6 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from 
test.helper import global_setup -global_setup() - import io import re From 07e40358799158e51453e2d2c493d265a495b9e0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 05:57:55 +0100 Subject: [PATCH 140/425] [viki] Fix uploader extraction --- youtube_dl/extractor/viki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cd986a749..20e8bbf7e 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -39,7 +39,7 @@ class VikiIE(SubtitlesInfoExtractor): if uploader_m is None: uploader = None else: - uploader = uploader.group(1).strip() + uploader = uploader_m.group(1).strip() rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, From 94ccb6fa2e3ec014bb995d05bfe634cf986d6198 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 05:58:04 +0100 Subject: [PATCH 141/425] [viki] Fix subtitles extraction --- youtube_dl/extractor/viki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 20e8bbf7e..ac199d410 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -91,7 +91,7 @@ class VikiIE(SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, info_webpage): res = {} - for sturl in re.findall(r'<track src="([^"]+)"/>'): + for sturl in re.findall(r'<track src="([^"]+)"/>', info_webpage): m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) if not m: continue From de79c46c8fa86dd3cb2383fd46cdd19a48e2f81f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:06:18 +0100 Subject: [PATCH 142/425] [viki] Fix subtitle extraction --- youtube_dl/extractor/viki.py | 4 +++- youtube_dl/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py 
b/youtube_dl/extractor/viki.py index ac199d410..2206a06d5 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,6 +2,7 @@ import re from ..utils import ( ExtractorError, + unescapeHTML, unified_strdate, ) from .subtitles import SubtitlesInfoExtractor @@ -91,7 +92,8 @@ class VikiIE(SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, info_webpage): res = {} - for sturl in re.findall(r'<track src="([^"]+)"/>', info_webpage): + for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage): + sturl = unescapeHTML(sturl_html) m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) if not m: continue diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index caec00e37..946e90e93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -553,7 +553,7 @@ def make_HTTPS_handler(opts_no_check_certificate): self._tunnel() try: self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) - except ssl.SSLError as e: + except ssl.SSLError: self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): From daa0dd2973212fc1b2837c9572b1502f91f6acbc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:06:39 +0100 Subject: [PATCH 143/425] release 2013.11.25.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 91b36e55c..2af23040f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25' +__version__ = '2013.11.25.1' From d0d2b49ab728e70b8b325298e7825760fa7b3775 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 06:17:41 +0100 Subject: [PATCH 144/425] [FileDownloader] use moved format_bytes method --- youtube_dl/FileDownloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index c6276d194..27684d0f6 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -289,7 +289,7 @@ class FileDownloader(object): data_len = None if percent > 0: data_len = int(downloaded_data_len * 100 / percent) - data_len_str = u'~'+self.format_bytes(data_len) + data_len_str = u'~' + format_bytes(data_len) self.report_progress(percent, data_len_str, speed, eta) cursor_in_new_line = False self._hook_progress({ From 5db07df634713fe73e15e98de62f70ffe3a47871 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 15:46:54 +0100 Subject: [PATCH 145/425] Fix --download-archive (Fixes #1826) --- youtube_dl/YoutubeDL.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 30ba94666..a1ef3a94a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -836,20 +836,26 @@ class YoutubeDL(object): except (IOError, OSError): self.report_warning(u'Unable to remove downloaded video file') - def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - extractor = info_dict.get('extractor_id') + def _make_archive_id(self, info_dict): + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = info_dict.get('extractor') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist if extractor is None: + return None # Incomplete video information + return extractor.lower() + u' ' + info_dict['id'] + + def in_download_archive(self, info_dict): + fn = self.params.get('download_archive') + if fn is None: + return False + + vid_id = self._make_archive_id(info_dict) + if vid_id is None: return False # Incomplete video information - # Future-proof against any change in case - # and backwards 
compatibility with prior versions - extractor = extractor.lower() - vid_id = extractor + u' ' + info_dict['id'] + try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -864,7 +870,8 @@ class YoutubeDL(object): fn = self.params.get('download_archive') if fn is None: return - vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + vid_id = self._make_archive_id(info_dict) + assert vid_id with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + u'\n') From bb2bebdbe1ef06adc3c1cb2d078e061f44cf7d29 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 15:47:14 +0100 Subject: [PATCH 146/425] release 2013.11.25.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2af23040f..aed0c4e75 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.1' +__version__ = '2013.11.25.2' From d46cc192d763f66655247ee122e397626481caca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 19:11:01 +0100 Subject: [PATCH 147/425] Reduce socket timeout --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1ef3a94a..0cef1daf3 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -968,7 +968,7 @@ class YoutubeDL(object): proxy_map.update(handler.proxies) write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - def _setup_opener(self, timeout=300): + def _setup_opener(self, timeout=20): opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') From 2a15e7063bceed326bcbc0a01ba77324f0373a0c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 20:30:41 +0100 Subject: [PATCH 148/425] [soundcloud] Prefer HTTP over RTMP 
(#1798) --- youtube_dl/extractor/soundcloud.py | 66 ++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 67b2dff9c..ee8da227e 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -76,44 +76,74 @@ class SoundcloudIE(InfoExtractor): def _extract_info_dict(self, info, full_title=None, quiet=False): track_id = compat_str(info['id']) name = full_title or track_id - if quiet == False: + if quiet: self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') + ext = info.get('original_format', u'mp3') result = { - 'id': track_id, + 'id': track_id, 'uploader': info['user']['username'], 'upload_date': unified_strdate(info['created_at']), - 'title': info['title'], - 'ext': info.get('original_format', u'mp3'), + 'title': info['title'], 'description': info['description'], 'thumbnail': thumbnail, } if info.get('downloadable', False): # We can build a direct link to the song - result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) + format_url = ( + u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( + track_id, self._CLIENT_ID)) + result['formats'] = [{ + 'format_id': 'download', + 'ext': ext, + 'url': format_url, + }] else: # We have to retrieve the url stream_json = self._download_webpage( 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID), track_id, u'Downloading track url') - # There should be only one entry in the dictionary - key, stream_url = list(json.loads(stream_json).items())[0] - if key.startswith(u'http'): - result['url'] = stream_url - elif key.startswith(u'rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - result.update({ - 'url': 
url, - 'play_path': 'mp3:' + path, - }) - else: + + formats = [] + format_dict = json.loads(stream_json) + for key, stream_url in format_dict.items(): + if key.startswith(u'http'): + formats.append({ + 'format_id': key, + 'ext': ext, + 'url': stream_url, + }) + elif key.startswith(u'rtmp'): + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = stream_url.split('mp3:', 1) + formats.append({ + 'format_id': key, + 'url': url, + 'play_path': 'mp3:' + path, + 'ext': ext, + }) + + if not formats: # We fallback to the stream_url in the original info, this # cannot be always used, sometimes it can give an HTTP 404 error - result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID, + formats.append({ + 'format_id': u'fallback', + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'ext': ext, + }) + + def format_pref(f): + if f['format_id'].startswith('http'): + return 2 + if f['format_id'].startswith('rtmp'): + return 1 + return 0 + + formats.sort(key=format_pref) + result['formats'] = formats return result From 1a62c18f6521803ab41483f5da56fc72957d7655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:03:20 +0100 Subject: [PATCH 149/425] [bambuser] Skip the download in the test It doesn't respect the 'Range' header. 
--- youtube_dl/extractor/bambuser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 967568c4a..b80508efe 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -25,6 +25,11 @@ class BambuserIE(InfoExtractor): u'uploader': u'pixelversity', u'uploader_id': u'344706', }, + u'params': { + # It doesn't respect the 'Range' header, it would download the whole video + # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59 + u'skip_download': True, + }, } def _real_extract(self, url): From a3927cf7eefd2318cdfb44cdb213b3810ea7627b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 21:55:20 +0100 Subject: [PATCH 150/425] Allow to initialize a YoutubeDL object without parameters Having to pass the 'outtmpl' parameter feels really strange when you just want to extract the info of a video. --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1ef3a94a..46635bce1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -146,7 +146,7 @@ class YoutubeDL(object): _num_downloads = None _screen_file = None - def __init__(self, params): + def __init__(self, params={}): """Create a FileDownloader object with the given options.""" self._ies = [] self._ies_instances = {} @@ -169,7 +169,7 @@ class YoutubeDL(object): self.params = params self.fd = FileDownloader(self, self.params) - if '%(stitle)s' in self.params['outtmpl']: + if '%(stitle)s' in self.params.get('outtmpl', ''): self.report_warning(u'%(stitle)s is deprecated. 
Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') self._setup_opener() From 0c75c3fa7a24c05a74891ec49e5a18de4f2792f1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:15:20 +0100 Subject: [PATCH 151/425] Do not warn about fixed output template if --max-downloads is 1 Fixes #1828 --- youtube_dl/YoutubeDL.py | 8 +++----- youtube_dl/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0cef1daf3..50f750593 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -258,10 +258,6 @@ class YoutubeDL(object): if self.params.get('cookiefile') is not None: self.cookiejar.save() - def fixed_template(self): - """Checks if the output template is fixed.""" - return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) - def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. 
@@ -798,7 +794,9 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" - if len(url_list) > 1 and self.fixed_template(): + if (len(url_list) > 1 and + '%' not in self.params['outtmpl'] + and self.params.get('max_downloads') != 1): raise SameFileError(self.params['outtmpl']) for url in url_list: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1f15c7eaa..102508cf9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -206,7 +206,9 @@ def parseOpts(overrideArguments=None): dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') - selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) + selection.add_option('--max-downloads', metavar='NUMBER', + dest='max_downloads', type=int, default=None, + help='Abort after downloading NUMBER files') selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None) selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 
50k or 44.6m)", default=None) selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) From d9b011f201ef61c10ce63b6078cd1e21b6da4d4a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:31:27 +0100 Subject: [PATCH 152/425] Fix rtmpdump with non-ASCII filenames on Windows on 2.x Reported in #1798 --- youtube_dl/FileDownloader.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 27684d0f6..3ff9716b3 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -339,13 +339,29 @@ class FileDownloader(object): if live: basic_args += ['--live'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] + + if sys.platform == 'win32' and sys.version_info < (3, 0): + # Windows subprocess module does not actually support Unicode + # on Python 2.x + # See http://stackoverflow.com/a/9951851/35070 + subprocess_encoding = sys.getfilesystemencoding() + args = [a.encode(subprocess_encoding, 'ignore') for a in args] + else: + subprocess_encoding = None + if self.params.get('verbose', False): + if subprocess_encoding: + str_args = [ + a.decode(subprocess_encoding) if isinstance(a, bytes) else a + for a in args] + else: + str_args = args try: import pipes - shell_quote = lambda args: ' '.join(map(pipes.quote, args)) + shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) except ImportError: shell_quote = repr - self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) + self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) retval = run_rtmpdump(args) From fb04e40396509fd2bd41250eec3b07adf1aa1125 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:34:56 +0100 Subject: [PATCH 153/425] [soundcloud] Support for listing of audio-only files 
--- youtube_dl/YoutubeDL.py | 5 ++++- youtube_dl/extractor/soundcloud.py | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d0aab1bbd..87635e173 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -875,6 +875,8 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): + if format.get('vcodec') == 'none': + return 'audio only' if format.get('_resolution') is not None: return format['_resolution'] if format.get('height') is not None: @@ -891,7 +893,8 @@ class YoutubeDL(object): res = u'' if fdict.get('format_note') is not None: res += fdict['format_note'] + u' ' - if fdict.get('vcodec') is not None: + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: res += u'video' diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ee8da227e..3a19ab172 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -100,6 +100,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': 'download', 'ext': ext, 'url': format_url, + 'vcodec': 'none', }] else: # We have to retrieve the url @@ -115,6 +116,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': key, 'ext': ext, 'url': stream_url, + 'vcodec': 'none', }) elif key.startswith(u'rtmp'): # The url doesn't have an rtmp app, we have to extract the playpath @@ -124,6 +126,7 @@ class SoundcloudIE(InfoExtractor): 'url': url, 'play_path': 'mp3:' + path, 'ext': ext, + 'vcodec': 'none', }) if not formats: @@ -133,6 +136,7 @@ class SoundcloudIE(InfoExtractor): 'format_id': u'fallback', 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'ext': ext, + 'vcodec': 'none', }) def format_pref(f): From 781a7d054657d813527fa0f98f831679675f8ea7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 25 Nov 2013 22:36:18 +0100 Subject: 
[PATCH 154/425] release 2013.11.25.3 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index aed0c4e75..fc0881201 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.2' +__version__ = '2013.11.25.3' From 529a2e2cc35df8c77418f9d02b0f5b4730b95b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:52:09 +0100 Subject: [PATCH 155/425] Fix typo in the documentation of the 'download_archive' param --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 87635e173..e23042c48 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -126,7 +126,7 @@ class YoutubeDL(object): noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. - downloadarchive: File name of a file where all downloads are recorded. + download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. From d31209a1449d0bd9315e063be4cf7f5d45726563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 25 Nov 2013 22:57:15 +0100 Subject: [PATCH 156/425] Use the 'extractor_key' field for the download archive file It has the same value as the ie_key. 
--- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e23042c48..e86e8a090 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -837,7 +837,7 @@ class YoutubeDL(object): def _make_archive_id(self, info_dict): # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor') + extractor = info_dict.get('extractor_key') if extractor is None: if 'id' in info_dict: extractor = info_dict.get('ie_key') # key in a playlist From c2e52508cca307113ff0c3aedcc0519d92c48f00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 08:03:11 +0100 Subject: [PATCH 157/425] Include the proxy in the parameters for YoutubeDL (fixes #1831) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 102508cf9..0704515df 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -651,6 +651,7 @@ def _real_main(argv=None): 'download_archive': opts.download_archive, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, + 'proxy': opts.proxy, } with YoutubeDL(ydl_opts) as ydl: From c5ed4e8f7efaa258c74dd3179a7c691208874e41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 26 Nov 2013 10:41:35 +0100 Subject: [PATCH 158/425] release 2013.11.26 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fc0881201..99a5e0505 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.25.3' +__version__ = '2013.11.26' From 4a98cdbf3b19b07c7a885d348e79ddf79318f133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= 
<jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 18:53:36 +0100 Subject: [PATCH 159/425] YoutubeDL: set the 'params' property before any message/warning/error is sent (fixes #1840) If it sets the 'restrictfilenames' param, it will first report a warning. It will try to get the logger from the 'params' property, which would be set at that moment to None, raising the error 'AttributeError: 'NoneType' object has no attribute 'get'' --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e86e8a090..711b5d79e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -155,6 +155,7 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self.params = params if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -164,9 +165,8 @@ class YoutubeDL(object): u'Assuming --restrict-filenames since file system encoding ' u'cannot encode all charactes. 
' u'Set the LC_ALL environment variable to fix this.') - params['restrictfilenames'] = True + self.params['restrictfilenames'] = True - self.params = params self.fd = FileDownloader(self, self.params) if '%(stitle)s' in self.params.get('outtmpl', ''): From 6e47b51eef26dbaa3634b73914e4ee7213ad38f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 19:09:14 +0100 Subject: [PATCH 160/425] [youtube:playlist] Remove the link with index 0 It's not the first video of the playlist, it appears in the 'Play all' button (see the test course for an example) --- youtube_dl/extractor/youtube.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bf9cb7d4..4c43d5739 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1528,7 +1528,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' - _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' + _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' IE_NAME = u'youtube:playlist' @classmethod @@ -1562,8 +1562,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) - # The ids are duplicated - new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + matches = re.finditer(self._VIDEO_RE, page) + # We remove the duplicates and the link with index 0 + # (it's not the first video of the playlist) + new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') ids.extend(new_ids) if re.search(self._MORE_PAGES_INDICATOR, page) is None: From e26f8712289c727a43d74a4669aee4924b9f75f2 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 18:48:52 +0100 Subject: [PATCH 161/425] Use the new '_download_xml' helper in more extractors --- youtube_dl/extractor/anitube.py | 4 +--- youtube_dl/extractor/arte.py | 7 ++----- youtube_dl/extractor/canalplus.py | 4 +--- youtube_dl/extractor/clipfish.py | 4 +--- youtube_dl/extractor/cnn.py | 4 +--- youtube_dl/extractor/comedycentral.py | 7 ++----- youtube_dl/extractor/daum.py | 10 +++------- youtube_dl/extractor/dreisat.py | 4 +--- youtube_dl/extractor/ebaumsworld.py | 4 +--- youtube_dl/extractor/faz.py | 4 +--- youtube_dl/extractor/francetv.py | 4 +--- youtube_dl/extractor/internetvideoarchive.py | 7 ++----- youtube_dl/extractor/jeuxvideo.py | 8 ++------ youtube_dl/extractor/justintv.py | 4 +--- youtube_dl/extractor/livestream.py | 4 +--- youtube_dl/extractor/mtv.py | 3 +-- youtube_dl/extractor/myspass.py | 4 +--- youtube_dl/extractor/naver.py | 7 ++----- youtube_dl/extractor/nbc.py | 5 ++--- youtube_dl/extractor/nhl.py | 4 +--- youtube_dl/extractor/niconico.py | 10 +++------- youtube_dl/extractor/sina.py | 4 +--- youtube_dl/extractor/spiegel.py | 5 +---- youtube_dl/extractor/teamcoco.py | 4 +--- youtube_dl/extractor/toutv.py | 5 +---- youtube_dl/extractor/trilulilu.py | 5 +---- youtube_dl/extractor/videofyme.py | 4 +--- youtube_dl/extractor/youtube.py | 4 +--- 28 files changed, 38 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py index 691d5a844..2b019daa9 100644 --- a/youtube_dl/extractor/anitube.py +++ b/youtube_dl/extractor/anitube.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -28,9 +27,8 @@ class AnitubeIE(InfoExtractor): key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, u'key') - webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % 
key, + config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key) - config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8')) video_title = config_xml.find('title').text diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 44d0b5d70..8b62ee774 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,7 +1,6 @@ # encoding: utf-8 import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -78,8 +77,7 @@ class ArteTvIE(InfoExtractor): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata') - ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml) + ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') @@ -109,9 +107,8 @@ class ArteTvIE(InfoExtractor): """Extract form http://liveweb.arte.tv/""" webpage = self._download_webpage(url, name) video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') - config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, video_id, u'Downloading information') - config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) event_doc = config_doc.find('event') url_node = event_doc.find('video').find('urlHd') if url_node is None: diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index bfa2a8b40..7cdcd8399 100644 --- 
a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import unified_strdate @@ -31,11 +30,10 @@ class CanalplusIE(InfoExtractor): webpage = self._download_webpage(url, mobj.group('path')) video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') info_url = self._VIDEO_INFO_TEMPLATE % video_id - info_page = self._download_webpage(info_url,video_id, + doc = self._download_xml(info_url,video_id, u'Downloading video info') self.report_extraction(video_id) - doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) video_info = [video for video in doc if video.find('ID').text == video_id][0] infos = video_info.find('INFOS') media = video_info.find('MEDIA') diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 95449da3c..5f0b5602f 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,6 +1,5 @@ import re import time -import xml.etree.ElementTree from .common import InfoExtractor @@ -25,9 +24,8 @@ class ClipfishIE(InfoExtractor): info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % (video_id, int(time.time()))) - info_xml = self._download_webpage( + doc = self._download_xml( info_url, video_id, note=u'Downloading info page') - doc = xml.etree.ElementTree.fromstring(info_xml) title = doc.find('title').text video_url = doc.find('filename').text thumbnail = doc.find('imageurl').text diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 34adf6dda..a034bb2fb 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -33,8 +32,7 @@ class CNNIE(InfoExtractor): path = mobj.group('path') page_title = mobj.group('title') info_url = 
u'http://cnn.com/video/data/3.0/%s/index.xml' % path - info_xml = self._download_webpage(info_url, page_title) - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml(info_url, page_title) formats = [] for f in info.findall('files/file'): diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 725849d2e..23647f99e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from .mtv import MTVIE, _media_xml_tag @@ -158,13 +157,12 @@ class ComedyCentralShowsIE(InfoExtractor): uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) - indexXml = self._download_webpage(indexUrl, epTitle, + idoc = self._download_xml(indexUrl, epTitle, u'Downloading show index', u'unable to download episode index') results = [] - idoc = xml.etree.ElementTree.fromstring(indexXml) itemEls = idoc.findall('.//item') for partNum,itemEl in enumerate(itemEls): mediaId = itemEl.findall('./guid')[0].text @@ -175,10 +173,9 @@ class ComedyCentralShowsIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' 
+ compat_urllib_parse.urlencode({'uri': mediaId})) - configXml = self._download_webpage(configUrl, epTitle, + cdoc = self._download_xml(configUrl, epTitle, u'Downloading configuration for %s' % shortMediaId) - cdoc = xml.etree.ElementTree.fromstring(configXml) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index a804e83bd..3d1dcb793 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -32,14 +31,12 @@ class DaumIE(InfoExtractor): full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', webpage, u'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) - info_xml = self._download_webpage( + info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, u'Downloading video info') - urls_xml = self._download_webpage( + urls = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query, video_id, u'Downloading video formats info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) - urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) self.to_screen(u'%s: Getting video urls' % video_id) formats = [] @@ -49,10 +46,9 @@ class DaumIE(InfoExtractor): 'vid': full_id, 'profile': profile, }) - url_xml = self._download_webpage( + url_doc = self._download_xml( 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, video_id, note=False) - url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8')) format_url = url_doc.find('result/url').text formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 765cb1f37..3cb382e12 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,7 +1,6 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details') - details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8')) + details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details') thumbnail_els = details_doc.findall('.//teaserimage') thumbnails = [{ diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index f02c6998b..877113d63 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import determine_ext @@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - config_xml = self._download_webpage( + config = self._download_xml( 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video_url = config.find('file').text return { diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 89ed08db4..c0169de04 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -1,6 +1,5 @@ # encoding: 
utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -28,9 +27,8 @@ class FazIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, u'config xml url') - config_xml = self._download_webpage(config_xml_url, video_id, + config = self._download_xml(config_xml_url, video_id, u'Downloading config xml') - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) encodings = config.find('ENCODINGS') formats = [] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 086cafca0..6e1971043 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -11,11 +10,10 @@ from ..utils import ( class FranceTVBaseInfoExtractor(InfoExtractor): def _extract_video(self, video_id): - xml_desc = self._download_webpage( + info = self._download_xml( 'http://www.francetvinfo.fr/appftv/webservices/video/' 'getInfosOeuvre.php?id-diffusion=' + video_id, video_id, 'Downloading XML config') - info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8')) manifest_url = info.find('videos/video/url').text video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8') diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index be8e05f53..16a6f73c8 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor): video_id = query_dic['publishedid'][0] url = self._build_url(query) - flashconfiguration_xml = self._download_webpage(url, video_id, + flashconfiguration = self._download_xml(url, video_id, 
u'Downloading flash configuration') - flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') # Replace some of the parameters in the query to get the best quality @@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor): file_url = re.sub(r'(?<=\?)(.+)$', lambda m: self._clean_query(m.group()), file_url) - info_xml = self._download_webpage(file_url, video_id, + info = self._download_xml(file_url, video_id, u'Downloading video info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) item = info.find('channel/item') def _bp(p): diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 0020c47cf..caf9d8c85 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -2,7 +2,6 @@ import json import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor): r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml', xml_link, u'video ID') - xml_config = self._download_webpage( + config = self._download_xml( xml_link, title, u'Downloading XML config') - config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8')) - info_json = self._search_regex( - r'(?sm)<format\.json>(.*?)</format\.json>', - xml_config, u'JSON information') + info_json = config.find('format.json').text info = json.loads(info_json)['versions'][0] video_url = 'http://video720.jeuxvideo.com/' + info['file'] diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py index f60017992..e9bde0c18 100644 --- a/youtube_dl/extractor/justintv.py +++ b/youtube_dl/extractor/justintv.py @@ -1,7 +1,6 @@ import json import os import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -94,10 +93,9 @@ class JustinTVIE(InfoExtractor): archive_id = 
m.group(1) api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id - chapter_info_xml = self._download_webpage(api, chapter_id, + doc = self._download_xml(api, chapter_id, note=u'Downloading chapter information', errnote=u'Chapter information download failed') - doc = xml.etree.ElementTree.fromstring(chapter_info_xml) for a in doc.findall('.//archive'): if archive_id == a.find('./id').text: break diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 5f548437c..9bc35b115 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,6 +1,5 @@ import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -80,8 +79,7 @@ class LivestreamOriginalIE(InfoExtractor): user = mobj.group('user') api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) - api_response = self._download_webpage(api_url, video_id) - info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8')) + info = self._download_xml(api_url, video_id) item = info.find('channel').find('item') ns = {'media': 'http://search.yahoo.com/mrss'} thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 04afd6c4c..42aee58be 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -109,9 +109,8 @@ class MTVIE(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id, + idoc = self._download_xml(self._FEED_URL +'?' 
+ data, video_id, u'Downloading info') - idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8')) return [self._get_video_info(item) for item in idoc.findall('.//item')] def _real_extract(self, url): diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 107665d15..0067bf134 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -1,5 +1,4 @@ import os.path -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -33,8 +32,7 @@ class MySpassIE(InfoExtractor): # get metadata metadata_url = META_DATA_URL_TEMPLATE % video_id - metadata_text = self._download_webpage(metadata_url, video_id) - metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) + metadata = self._download_xml(metadata_url, video_id) # extract values from metadata url_flv_el = metadata.find('url_flv') diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 9df236d69..d290397c7 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -1,6 +1,5 @@ # encoding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -38,14 +37,12 @@ class NaverIE(InfoExtractor): 'protocol': 'p2p', 'inKey': key, }) - info_xml = self._download_webpage( + info = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, video_id, u'Downloading video info') - urls_xml = self._download_webpage( + urls = self._download_xml( 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' 
+ query_urls, video_id, u'Downloading video formats info') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) - urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3bc9dae6d..e8bbfff7b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import find_xpath_attr, compat_str @@ -21,8 +20,8 @@ class NBCNewsIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video') + all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) + info = all_info.find('video') return {'id': video_id, 'title': info.find('headline').text, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 458fe4063..2edd806a3 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,6 +1,5 @@ import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -26,9 +25,8 @@ class NHLBaseInfoExtractor(InfoExtractor): 'path': initial_video_url.replace('.mp4', '_sd.mp4'), }) path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' 
+ data - path_response = self._download_webpage(path_url, video_id, + path_doc = self._download_xml(path_url, video_id, u'Downloading final video url') - path_doc = xml.etree.ElementTree.fromstring(path_response) video_url = path_doc.find('path').text join = compat_urlparse.urljoin diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 729607ea3..46774317c 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -2,7 +2,6 @@ import re import socket -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -81,7 +80,7 @@ class NiconicoIE(InfoExtractor): # the cookies in order to be able to download the info webpage self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) - video_info_webpage = self._download_webpage( + video_info = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note=u'Downloading video info page') @@ -92,7 +91,6 @@ class NiconicoIE(InfoExtractor): video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - video_info = xml.etree.ElementTree.fromstring(video_info_webpage) video_title = video_info.find('.//title').text video_extension = video_info.find('.//movie_type').text video_format = video_extension.upper() @@ -107,13 +105,11 @@ class NiconicoIE(InfoExtractor): video_uploader = video_uploader_id url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id try: - user_info_webpage = self._download_webpage( + user_info = self._download_xml( url, video_id, note=u'Downloading user information') + video_uploader = user_info.find('.//nickname').text except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) - else: - user_info = xml.etree.ElementTree.fromstring(user_info_webpage) - video_uploader = 
user_info.find('.//nickname').text return { 'id': video_id, diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 14b1c656c..74a87fe56 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -1,7 +1,6 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -35,12 +34,11 @@ class SinaIE(InfoExtractor): def _extract_video(self, video_id): data = compat_urllib_parse.urlencode({'vid': video_id}) - url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data, + url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, video_id, u'Downloading video url') image_page = self._download_webpage( 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, video_id, u'Downloading thumbnail info') - url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8')) return {'id': video_id, 'url': url_doc.find('./durl/url').text, diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 19ce585cf..695520524 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -33,12 +32,10 @@ class SpiegelIE(InfoExtractor): r'<div class="module-title">(.*?)</div>', webpage, u'title') xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' - xml_code = self._download_webpage( + idoc = self._download_xml( xml_url, video_id, note=u'Downloading XML', errnote=u'Failed to download XML') - idoc = xml.etree.ElementTree.fromstring(xml_code) - formats = [ { 'format_id': n.tag.rpartition('type')[2], diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 165d9f88b..2bf26d056 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils 
import ( @@ -32,8 +31,7 @@ class TeamcocoIE(InfoExtractor): self.report_extraction(video_id) data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id - data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage') - data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8')) + data = self._download_xml(data_url, video_id, 'Downloading data webpage') qualities = ['500k', '480p', '1000k', '720p', '1080p'] diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 2f728d3dc..1e9598ef6 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,6 +1,5 @@ # coding: utf-8 import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -40,11 +39,9 @@ class TouTvIE(InfoExtractor): r'"idMedia":\s*"([^"]+)"', webpage, u'media ID') streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId - streams_webpage = self._download_webpage( + streams_doc = self._download_xml( streams_url, video_id, note=u'Downloading stream list') - streams_doc = xml.etree.ElementTree.fromstring( - streams_webpage.encode('utf-8')) video_url = next(n.text for n in streams_doc.findall('.//choice/url') if u'//ad.doubleclick' not in n.text) diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 0bf028f61..1c49e580d 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -1,6 +1,5 @@ import json import re -import xml.etree.ElementTree from .common import InfoExtractor @@ -36,12 +35,10 @@ class TriluliluIE(InfoExtractor): format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' u'video-formats2' % log) - format_str = self._download_webpage( + format_doc = self._download_xml( format_url, video_id, note=u'Downloading formats', errnote=u'Error while downloading formats') - - format_doc = xml.etree.ElementTree.fromstring(format_str) video_url_template = ( u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' 
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f64ffa5..912802d9a 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -27,9 +26,8 @@ class VideofyMeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id, + config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, video_id) - config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) video = config.find('video') sources = video.find('sources') url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4c43d5739..a76a9071a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import socket import string import struct import traceback -import xml.etree.ElementTree import zlib from .common import InfoExtractor, SearchInfoExtractor @@ -1144,8 +1143,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'asrs': 1, }) list_url = caption_url + '&' + list_params - list_page = self._download_webpage(list_url, video_id) - caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) + caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') From 652cdaa269725dfbf9effdc18a8fd0b369100399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 26 Nov 2013 21:35:03 +0100 Subject: [PATCH 162/425] 
[youtube:playlist] Add support for YouTube mixes (fixes #1839) --- test/test_youtube_lists.py | 9 +++++++++ youtube_dl/extractor/youtube.py | 24 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 8fd073f31..95f07d129 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -107,5 +107,14 @@ class TestYoutubeLists(unittest.TestCase): result = ie.extract('http://www.youtube.com/show/airdisasters') self.assertTrue(len(result) >= 3) + def test_youtube_mix(self): + dl = FakeYDL() + ie = YoutubePlaylistIE(dl) + result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y') + entries = result['entries'] + self.assertTrue(len(entries) >= 20) + original_video = entries[0] + self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a76a9071a..9ef5fecce 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -28,6 +28,7 @@ from ..utils import ( clean_html, get_cachedir, get_element_by_id, + get_element_by_attribute, ExtractorError, unescapeHTML, unified_strdate, @@ -1537,6 +1538,22 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() + def _ids_to_results(self, ids): + return [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + + def _extract_mix(self, playlist_id): + # The mixes are generated from a a single video + # the id of the playlist is just 'RD' + video_id + url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) + webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') + title = clean_html(get_element_by_attribute('class', 'title long-title', webpage)) + video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) + 
ids = orderedSet(re.findall(video_re, webpage)) + url_results = self._ids_to_results(ids) + + return self.playlist_result(url_results, playlist_id, title) + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1554,6 +1571,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + if len(playlist_id) == 13: # 'RD' + 11 characters for the video id + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + # Extract the video ids from the playlist pages ids = [] @@ -1571,8 +1592,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] + url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) From c1f9c59d11c0a96be7caa0b4c6e90d900e3161c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 00:41:30 +0100 Subject: [PATCH 163/425] [bash-completion] Complete filenames or directories if the previous option requires it --- devscripts/bash-completion.in | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index ce893fcbe..cc469366d 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,10 +1,21 @@ __youtube_dl() { - local cur prev opts + local cur prev opts fileopts diropts COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" + fileopts="-a|--batch-file|--download-archive|--cookies" + diropts="--cache-dir" + + if [[ ${prev} =~ ${fileopts} ]]; then + COMPREPLY=( 
$(compgen -f -- ${cur}) ) + return 0 + elif [[ ${prev} =~ ${diropts} ]]; then + COMPREPLY=( $(compgen -d -- ${cur}) ) + return 0 + fi if [[ ${cur} =~ : ]]; then COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) ) From 5f09bbff4df9e11d803e3af066c08167f7f7dcb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 00:42:59 +0100 Subject: [PATCH 164/425] [bash-completion] Complete the ':ythistory' keyword --- devscripts/bash-completion.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index cc469366d..3af87a378 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -1,11 +1,11 @@ __youtube_dl() { - local cur prev opts fileopts diropts + local cur prev opts fileopts diropts keywords COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" opts="{{flags}}" - keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater" + keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" fileopts="-a|--batch-file|--download-archive|--cookies" diropts="--cache-dir" From 4b19e3895492a472c5b63d9da5777bc29d44e25c Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filosottile.wiki@gmail.com> Date: Wed, 27 Nov 2013 02:54:51 +0100 Subject: [PATCH 165/425] [videopremium] support new .me domain --- youtube_dl/extractor/videopremium.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py index 4800415bd..acae81448 100644 --- a/youtube_dl/extractor/videopremium.py +++ b/youtube_dl/extractor/videopremium.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class VideoPremiumIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?' 
+ _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?' _TEST = { u'url': u'http://videopremium.tv/4w7oadjsf156', u'file': u'4w7oadjsf156.f4v', @@ -41,4 +41,4 @@ class VideoPremiumIE(InfoExtractor): 'player_url': "http://videopremium.tv/uplayer/uppod.swf", 'ext': 'f4v', 'title': video_title, - } \ No newline at end of file + } From dcca796ce431da0d8b6927609c08938f22ba44cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 27 Nov 2013 18:33:51 +0100 Subject: [PATCH 166/425] [clipfish] Effect a better error message (#1842) --- youtube_dl/extractor/clipfish.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 5f0b5602f..05afce338 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,5 +1,6 @@ import re import time +import xml.etree.ElementTree from .common import InfoExtractor @@ -28,6 +29,10 @@ class ClipfishIE(InfoExtractor): info_url, video_id, note=u'Downloading info page') title = doc.find('title').text video_url = doc.find('filename').text + if video_url is None: + xml_bytes = xml.etree.ElementTree.tostring(doc) + raise ExtractorError(u'Cannot find video URL in document %r' % + xml_bytes) thumbnail = doc.find('imageurl').text duration_str = doc.find('duration').text m = re.match( From 76d1700b283ee482288eec12a6903a345742eead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 20:01:51 +0100 Subject: [PATCH 167/425] [youtube:playlist] Fix the extraction of the title for some mixes (#1844) Like https://www.youtube.com/watch?v=g8jDB5xOiuE&list=RDIh2gxLqR7HM --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9ef5fecce..fb61f47e8 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py @@ -1547,7 +1547,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') - title = clean_html(get_element_by_attribute('class', 'title long-title', webpage)) + title_span = (get_element_by_attribute('class', 'title long-title', webpage) or + get_element_by_attribute('class', 'title ', webpage)) + title = clean_html(title_span) video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) ids = orderedSet(re.findall(video_re, webpage)) url_results = self._ids_to_results(ids) From 35907e23ec4d7e754ff239693500e05886b80ee7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 27 Nov 2013 21:24:55 +0100 Subject: [PATCH 168/425] [yahoo] Fix video extraction and use the new format system exclusively --- youtube_dl/extractor/yahoo.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 34e6afb20..617e3bb06 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -46,7 +46,7 @@ class YahooIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$', + items_json = self._search_regex(r'mediaItems: ({.*?})$', webpage, u'items', flags=re.MULTILINE) items = json.loads(items_json) info = items['mediaItems']['query']['results']['mediaObj'][0] @@ -91,17 +91,13 @@ class YahooIE(InfoExtractor): formats.append(format_info) formats = sorted(formats, key=lambda f:(f['height'], f['width'])) - info = { + return { 'id': video_id, 'title': meta['title'], 'formats': formats, 'description': 
clean_html(meta['description']), 'thumbnail': meta['thumbnail'], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info class YahooSearchIE(SearchInfoExtractor): From 0e44d8381a439c84dd23477d32f7da4bb0a06293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 00:33:27 +0100 Subject: [PATCH 169/425] [youtube:feeds] Use the 'paging' value from the downloaded json information (fixes #1845) --- youtube_dl/extractor/youtube.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fb61f47e8..765b4a9bf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1791,7 +1791,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _PAGING_STEP = 30 # use action_load_personal_feed instead of action_load_system_feed _PERSONAL_FEED = False @@ -1811,9 +1810,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_extract(self, url): feed_entries = [] - # The step argument is available only in 2.7 or higher - for i in itertools.count(0): - paging = i*self._PAGING_STEP + paging = 0 + for i in itertools.count(1): info = self._download_webpage(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) @@ -1826,6 +1824,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video_id in ids) if info['paging'] is None: break + paging = info['paging'] return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): @@ -1845,7 +1844,6 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' _FEED_NAME = 'watch_later' _PLAYLIST_TITLE 
= u'Youtube Watch Later' - _PAGING_STEP = 100 _PERSONAL_FEED = True class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): @@ -1855,13 +1853,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _PERSONAL_FEED = True _PLAYLIST_TITLE = u'Youtube Watch History' - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') - data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') - # The step is actually a ridiculously big number (like 1374343569725646) - self._PAGING_STEP = int(data_paging) - return super(YoutubeHistoryIE, self)._real_extract(url) - class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' From a2e6db365c11d8c9eaaaeb8de53d59add648f978 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:47:20 +0100 Subject: [PATCH 170/425] [zdf] add a pseudo-testcase and fix URL matching --- youtube_dl/extractor/zdf.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 07f830e80..3c01cc041 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,3 +1,5 @@ +# coding: utf-8 + import operator import re @@ -9,7 +11,19 @@ from ..utils import ( class ZDFIE(InfoExtractor): - _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' + + _TEST = { + u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", + u"file": u"2037704.webm", + u"info_dict": { + u"upload_date": u"20131127", + u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. 
Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", + u"uploader": u"spezial", + u"title": u"ZDFspezial - Ende des Machtpokers" + }, + u"skip": u"Videos on ZDF.de are depublicised in short order", + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 2a275ab007d6d336b44a6a0cd4fac6783ba63cb8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:47:50 +0100 Subject: [PATCH 171/425] [zdf] Use _download_xml --- youtube_dl/extractor/common.py | 3 ++- youtube_dl/extractor/zdf.py | 8 ++++---- youtube_dl/utils.py | 5 ----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5656445a3..4f1b50880 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -210,7 +210,8 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] - def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + def _download_xml(self, url_or_request, video_id, + note=u'Downloading XML', errnote=u'Unable to download XML'): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 3c01cc041..689f19735 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( - parse_xml_doc, unified_strdate, ) @@ -30,9 +29,10 @@ class ZDFIE(InfoExtractor): video_id = mobj.group('video_id') xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - info_xml = 
self._download_webpage( - xml_url, video_id, note=u'Downloading video info') - doc = parse_xml_doc(info_xml) + doc = self._download_xml( + xml_url, video_id, + note=u'Downloading video info', + errnote=u'Failed to download video info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 946e90e93..c486ef8ec 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1009,11 +1009,6 @@ def unsmuggle_url(smug_url): return url, data -def parse_xml_doc(s): - assert isinstance(s, type(u'')) - return xml.etree.ElementTree.fromstring(s.encode('utf-8')) - - def format_bytes(bytes): if bytes is None: return u'N/A' From ea07dbb8b108d7c77b6b822fba98817063a8457a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:48:32 +0100 Subject: [PATCH 172/425] release 2013.11.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 99a5e0505..03cb283bd 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.26' +__version__ = '2013.11.28' From f8f60d27931421f969c7ec0a2a45caa743549994 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 05:54:46 +0100 Subject: [PATCH 173/425] [clipfish] Fix imports (#1842) --- youtube_dl/extractor/clipfish.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 05afce338..ba5623572 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -3,6 +3,7 @@ import time import xml.etree.ElementTree from .common import InfoExtractor +from ..utils import ExtractorError class ClipfishIE(InfoExtractor): From fc9e1cc69706ef079fca0ee32529503ecedae578 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 
06:10:37 +0100 Subject: [PATCH 174/425] [clipfish] Use FIFA trailer as testcase (#1842) --- youtube_dl/extractor/clipfish.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index ba5623572..0d18e9a7a 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -11,12 +11,12 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' _TEST = { - u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', - u'file': u'4028320.f4v', - u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', + u'file': u'3966754.mp4', + u'md5': u'2521cd644e862936cf2e698206e47385', u'info_dict': { - u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', - u'duration': 399, + u'title': u'FIFA 14 - E3 2013 Trailer', + u'duration': 82, } } From 4e0084d92e589d385f28ac98bfb847240d61dc93 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 06:14:17 +0100 Subject: [PATCH 175/425] [youtube/subtitles] Change MD5 of vtt subtitle in test --- test/test_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 94a1f771d..23a653124 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -72,7 +72,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') def test_youtube_list_subtitles(self): self.DL.expect_warning(u'Video doesn\'t have automatic captions') From 
2be54167d085c5b4c956c66ad0367fdcfb68b891 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 28 Nov 2013 06:17:56 +0100 Subject: [PATCH 176/425] release 2013.11.28.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 03cb283bd..9cae97ee2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.28' +__version__ = '2013.11.28.1' From d8d6148628b972b6998a8c2a5465f031a44f4004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 13:32:49 +0100 Subject: [PATCH 177/425] Add an extractor for Internet Movie Database trailers (closes #1832) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/imdb.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/imdb.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0b4d086b7..30e4a9105 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -71,6 +71,7 @@ from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE from .ign import IGNIE, OneUPIE +from .imdb import ImdbIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py new file mode 100644 index 000000000..07e4f7d29 --- /dev/null +++ b/youtube_dl/extractor/imdb.py @@ -0,0 +1,59 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_attribute, +) + + +class ImdbIE(InfoExtractor): + IE_NAME = u'imdb' + IE_DESC = u'Internet Movie Database trailers' + _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)' + + _TEST = { + u'url': 
u'http://www.imdb.com/video/imdb/vi2524815897', + u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068', + u'info_dict': { + u'id': u'2524815897', + u'ext': u'mp4', + u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + u'description': u'md5:9061c2219254e5d14e03c25c98e96a81', + u'duration': 151, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url,video_id) + descr = get_element_by_attribute('itemprop', 'description', webpage) + available_formats = re.findall( + r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, + flags=re.MULTILINE) + formats = [] + for f_id, f_path in available_formats: + format_page = self._download_webpage( + compat_urlparse.urljoin(url, f_path), + u'Downloading info for %s format' % f_id) + json_data = get_element_by_attribute('class', 'imdb-player-data', + format_page) + info = json.loads(json_data) + format_info = info['videoPlayerObject']['video'] + formats.append({ + 'format_id': f_id, + 'url': format_info['url'], + 'height': format_info['height'], + 'width': format_info['width'], + }) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': descr, + 'thumbnail': format_info['slate'], + 'duration': int(info['titleObject']['title']['duration_seconds']), + } From b03d0d064c0e198aa281faacb2b5a74af7628b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 13:49:00 +0100 Subject: [PATCH 178/425] [imdb] Fix extraction in python 2.6 Using a regular expression because the html cannot be parsed. 
--- youtube_dl/extractor/imdb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 07e4f7d29..520edc7d0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -38,8 +38,9 @@ class ImdbIE(InfoExtractor): format_page = self._download_webpage( compat_urlparse.urljoin(url, f_path), u'Downloading info for %s format' % f_id) - json_data = get_element_by_attribute('class', 'imdb-player-data', - format_page) + json_data = self._search_regex( + r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>', + format_page, u'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] formats.append({ From 3862402ff3a991e7fb58470ac38fba82ba9b18d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 28 Nov 2013 14:38:10 +0100 Subject: [PATCH 179/425] Add an extractor for Clipsyndicate (closes #1744) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clipsyndicate.py | 52 +++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/clipsyndicate.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 30e4a9105..1e4f36aa3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -21,6 +21,7 @@ from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE +from .clipsyndicate import ClipsyndicateIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py new file mode 100644 index 000000000..d4fc86973 --- /dev/null +++ b/youtube_dl/extractor/clipsyndicate.py @@ -0,0 +1,52 @@ +import re +import 
xml.etree.ElementTree + +from .common import InfoExtractor + +from ..utils import ( + find_xpath_attr, +) + + +class ClipsyndicateIE(InfoExtractor): + _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', + u'info_dict': { + u'id': u'4629301', + u'ext': u'mp4', + u'title': u'Brick Briscoe', + u'duration': 612, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + js_player = self._download_webpage( + 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, + video_id, u'Downlaoding player') + # it includes a required token + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + + playlist_page = self._download_webpage( + 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, + video_id, u'Downloading video info') + # Fix broken xml + playlist_page = re.sub('&', '&amp;', playlist_page) + pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + + track_doc = pdoc.find('trackList/track') + def find_param(name): + node = find_xpath_attr(track_doc, './/param', 'name', name) + if node is not None: + return node.attrib['value'] + + return { + 'id': video_id, + 'title': find_param('title'), + 'url': track_doc.find('location').text, + 'thumbnail': find_param('thumbnail'), + 'duration': int(find_param('duration')), + } From 677c18092d8fd5ca6e08b25985c8533b6a0738d5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 03:33:25 +0100 Subject: [PATCH 180/425] [podomatic] Add extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/podomatic.py | 49 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/podomatic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 
1e4f36aa3..fd890e251 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -107,6 +107,7 @@ from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .podomatic import PodomaticIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py new file mode 100644 index 000000000..58200971b --- /dev/null +++ b/youtube_dl/extractor/podomatic.py @@ -0,0 +1,49 @@ +import json +import re + +from .common import InfoExtractor + + +class PodomaticIE(InfoExtractor): + IE_NAME = 'podomatic' + _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' + + _TEST = { + u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", + u"file": u"2009-01-02T16_03_35-08_00.mp3", + u"md5": u"84bb855fcf3429e6bf72460e1eed782d", + u"info_dict": { + u"uploader": u"Science Teaching Tips", + u"uploader_id": u"scienceteachingtips", + u"title": u"64. 
When the Moon Hits Your Eye", + u"duration": 446, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + channel = mobj.group('channel') + + json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + + '?permalink=true&rtmp=0') % + (mobj.group('proto'), channel, video_id)) + data_json = self._download_webpage( + json_url, video_id, note=u'Downloading video info') + data = json.loads(data_json) + + video_url = data['downloadLink'] + uploader = data['podcast'] + title = data['title'] + thumbnail = data['imageLocation'] + duration = int(data['length'] / 1000.0) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'uploader': uploader, + 'uploader_id': channel, + 'thumbnail': thumbnail, + 'duration': duration, + } From 17769d5a6c24eb8f5d609aa99f84debc3fe4adec Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 03:34:26 +0100 Subject: [PATCH 181/425] release 2013.11.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9cae97ee2..a73d7fb5c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.28.1' +__version__ = '2013.11.29' From acf37ca151d67ee28034775662318d9a0a1eb6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 07:56:14 +0100 Subject: [PATCH 182/425] [imdb] Fix the resolution values (fixes #1847) We were using the size of the player, it was the same for all the formats --- youtube_dl/extractor/imdb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 520edc7d0..d8e9712a7 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -46,8 +46,7 @@ class ImdbIE(InfoExtractor): formats.append({ 'format_id': f_id, 
'url': format_info['url'], - 'height': format_info['height'], - 'width': format_info['width'], + 'height': int(info['titleObject']['encoding']['selected'][:-1]), }) return { From e1f900d6a4c449b2a7c7ed74dbe8eca74cbccf13 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser <nikai@nikai.net> Date: Fri, 29 Nov 2013 09:44:05 +0100 Subject: [PATCH 183/425] fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 580b16004..af4d969d6 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ which means you can modify it, redistribute it or use it however you like. directory ## Video Format Options: - -f, --format FORMAT video format code, specifiy the order of + -f, --format FORMAT video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported --all-formats download all available video formats From 9986238ba9a486ca76334c50562760a312ab20fa Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser <nikai@nikai.net> Date: Fri, 29 Nov 2013 09:48:38 +0100 Subject: [PATCH 184/425] fix typo in help --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0704515df..8f8422cc7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -235,7 +235,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default='best', - help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". 
"-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From a3fb4675fb67b061e2a71cec78a5dbd8695b8ef0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 15:25:09 +0100 Subject: [PATCH 185/425] Do not mutate default arguments In this case, it looks rather harmless (since the conditions for --restrict-filenames should not change while a process is running), but just to be sure. This also simplifies the interface for callers, who can just pass in the idiomatic None for "I don't care, whatever is the default". --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 711b5d79e..b822930cb 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -146,7 +146,7 @@ class YoutubeDL(object): _num_downloads = None _screen_file = None - def __init__(self, params={}): + def __init__(self, params=None): """Create a FileDownloader object with the given options.""" self._ies = [] self._ies_instances = {} @@ -155,7 +155,7 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] - self.params = params + self.params = {} if params is None else params if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] From befd88b786dc41ff075693fd17bafbc7fa4c100e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 15:25:43 +0100 Subject: [PATCH 186/425] [yahoo] Add an extractor for yahoo news (closes #1849) --- youtube_dl/extractor/__init__.py | 6 +++++- youtube_dl/extractor/yahoo.py | 34 +++++++++++++++++++++++++++++++- 2 
files changed, 38 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fd890e251..664639b53 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -172,7 +172,11 @@ from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeIE -from .yahoo import YahooIE, YahooSearchIE +from .yahoo import ( + YahooIE, + YahooNewsIE, + YahooSearchIE, +) from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 617e3bb06..2d87e81b2 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -53,8 +53,11 @@ class YahooIE(InfoExtractor): # The 'meta' field is not always in the video webpage, we request it # from another page long_id = info['id'] + return self._get_info(info['id'], video_id) + + def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' - ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id) + ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' % long_id) data = compat_urllib_parse.urlencode({ 'q': query, 'env': 'prod', @@ -100,6 +103,35 @@ class YahooIE(InfoExtractor): } +class YahooNewsIE(YahooIE): + IE_NAME = 'yahoo:news' + _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html' + + _TEST = { + u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + u'info_dict': { + u'id': u'104538833', + u'ext': u'flv', + u'title': u'China Moses Is Crazy About the Blues', + u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0', + }, + u'params': { + # Requires rtmpdump + u'skip_download': True, + }, + } + + # Overwrite YahooIE properties we don't want + _TESTS = [] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + 
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id') + return self._get_info(long_id, video_id) + + class YahooSearchIE(SearchInfoExtractor): IE_DESC = u'Yahoo screen search' _MAX_RESULTS = 1000 From 323ec6ae566af9744edce97a23e623d99eea8a1f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 Nov 2013 15:57:43 +0100 Subject: [PATCH 187/425] Clarify --download-archive help --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0704515df..c63d62986 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -220,7 +220,7 @@ def parseOpts(overrideArguments=None): default=None, type=int) selection.add_option('--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not present in the archive file. Record all downloaded videos in it.') + help='Download only videos not present in the archive file. Record the IDs of all downloaded videos in it.') authentication.add_option('-u', '--username', From c5171c454b4392f7276b7f9e94c25d7f1ad60375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 29 Nov 2013 22:06:17 +0100 Subject: [PATCH 188/425] [yahoo] Force use of the http protocol for downloading the videos. 
--- youtube_dl/extractor/yahoo.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 2d87e81b2..e457c4707 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,27 +17,21 @@ class YahooIE(InfoExtractor): _TESTS = [ { u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - u'file': u'214727115.flv', + u'file': u'214727115.mp4', + u'md5': u'4962b075c08be8690a922ee026d05e69', u'info_dict': { u'title': u'Julian Smith & Travis Legg Watch Julian Smith', u'description': u'Julian and Travis watch Julian Smith', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, }, { u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - u'file': u'103000935.flv', + u'file': u'103000935.mp4', + u'md5': u'd6e6fc6e1313c608f316ddad7b82b306', u'info_dict': { u'title': u'Codefellas - The Cougar Lies with Spanish Moss', u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? 
And if your waffles have nothing to hide, what are they so worried about?', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, }, ] @@ -57,7 +51,8 @@ class YahooIE(InfoExtractor): def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' - ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' % long_id) + ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' + ' AND protocol="http"' % long_id) data = compat_urllib_parse.urlencode({ 'q': query, 'env': 'prod', @@ -109,16 +104,13 @@ class YahooNewsIE(YahooIE): _TEST = { u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + u'md5': u'67010fdf3a08d290e060a4dd96baa07b', u'info_dict': { u'id': u'104538833', - u'ext': u'flv', + u'ext': u'mp4', u'title': u'China Moses Is Crazy About the Blues', u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0', }, - u'params': { - # Requires rtmpdump - u'skip_download': True, - }, } # Overwrite YahooIE properties we don't want From 06dcbb71d8e19947eb6e71390a6a0640abe3dad0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 30 Nov 2013 00:42:43 +0100 Subject: [PATCH 189/425] Clarify help of --write-pages (#1853) --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c63d62986..42ab572f2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -317,7 +317,7 @@ def parseOpts(overrideArguments=None): help='print downloaded pages to debug problems(very verbose)') verbosity.add_option('--write-pages', action='store_true', dest='write_pages', default=False, - help='Write downloaded pages to files in the current directory') + help='Write downloaded intermediary pages to files in the current directory to debug problems') verbosity.add_option('--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) From 
0a688bc0b28c970e9af965b3fa0c7927507eeb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 30 Nov 2013 14:56:51 +0100 Subject: [PATCH 190/425] [youtube] Add support for downloading top lists (fixes #1868) It needs to know the channel and the title of the list, because the ids change every time you browse the channels and are attached to a 'VISITOR_INFO1_LIVE' cookie. --- test/test_youtube_lists.py | 8 ++++++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 95f07d129..33db09f43 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( YoutubeIE, YoutubeChannelIE, YoutubeShowIE, + YoutubeTopListIE, ) @@ -116,5 +117,12 @@ class TestYoutubeLists(unittest.TestCase): original_video = entries[0] self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + def test_youtube_toplist(self): + dl = FakeYDL() + ie = YoutubeTopListIE(dl) + result = ie.extract('yttoplist:music:Top Tracks') + entries = result['entries'] + self.assertTrue(len(entries) >= 9) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 664639b53..0abf86e44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -194,6 +194,7 @@ from .youtube import ( YoutubeWatchLaterIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeTopListIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 765b4a9bf..a1a4d896d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1576,6 +1576,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if len(playlist_id) == 13: # 'RD' + 11 characters for the video id # Mixes 
require a custom extraction process return self._extract_mix(playlist_id) + if playlist_id.startswith('TL'): + raise ExtractorError(u'For downloading YouTube.com top lists, use ' + u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) # Extract the video ids from the playlist pages ids = [] @@ -1598,6 +1601,38 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, playlist_title) +class YoutubeTopListIE(YoutubePlaylistIE): + IE_NAME = u'youtube:toplist' + IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"' + u' (Example: "yttoplist:music:Top Tracks")') + _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel = mobj.group('chann') + title = mobj.group('title') + query = compat_urllib_parse.urlencode({'title': title}) + playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query) + channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) + link = self._html_search_regex(playlist_re, channel_page, u'list') + url = compat_urlparse.urljoin('https://www.youtube.com/', link) + + video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' + ids = [] + # sometimes the webpage doesn't contain the videos + # retry until we get them + for i in itertools.count(0): + msg = u'Downloading Youtube mix' + if i > 0: + msg += ', retry #%d' % i + webpage = self._download_webpage(url, title, msg) + ids = orderedSet(re.findall(video_re, webpage)) + if ids: + break + url_results = self._ids_to_results(ids) + return self.playlist_result(url_results, playlist_title=title) + + class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" From 5e09d6abbd09de92869cbb8ed204d18f9cd04931 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister 
<phihag@phihag.de> Date: Sun, 1 Dec 2013 01:16:20 +0100 Subject: [PATCH 191/425] [clipfish] Skip test on travis --- youtube_dl/extractor/clipfish.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 0d18e9a7a..43efb08bf 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -17,7 +17,8 @@ class ClipfishIE(InfoExtractor): u'info_dict': { u'title': u'FIFA 14 - E3 2013 Trailer', u'duration': 82, - } + }, + u'skip': 'Blocked in the US' } def _real_extract(self, url): From 355e4fd07e7f9c0632d9d78415675f8b5cc3c2ce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Dec 2013 01:21:33 +0100 Subject: [PATCH 192/425] [generic] Find embedded dailymotion videos (Fixes #1848) --- youtube_dl/extractor/generic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 37671430a..10ae06263 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -195,6 +195,15 @@ class GenericIE(InfoExtractor): return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for embedded Dailymotion player + matches = re.findall( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) + if matches: + urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') + for tuppl in matches] + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: From e344693b65a42436eb40efe85095c01f767a502d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Dec 2013 11:42:02 +0100 Subject: [PATCH 193/425] Make socket timeout configurable, and bump default to 10 
minutes (#1862) --- test/parameters.json | 3 ++- youtube_dl/YoutubeDL.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index f042880ed..487a46d56 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -39,5 +39,6 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false + "listssubtitles": false, + "socket_timeout": 20 } diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b822930cb..b7393fd79 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -132,6 +132,7 @@ class YoutubeDL(object): cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates proxy: URL of the proxy server to use + socket_timeout: Time to wait for unresponsive hosts, in seconds The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -969,7 +970,8 @@ class YoutubeDL(object): proxy_map.update(handler.proxies) write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - def _setup_opener(self, timeout=20): + def _setup_opener(self): + timeout = float(self.params.get('socket_timeout', 600)) opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') From 55a10eab48776197245d3d87b86195f182d8d82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 1 Dec 2013 22:36:18 +0100 Subject: [PATCH 194/425] [vimeo] Add an extractor for users (closes #1871) --- test/test_all_urls.py | 4 ++++ test/test_playlists.py | 9 +++++++ youtube_dl/extractor/__init__.py | 6 ++++- youtube_dl/extractor/vimeo.py | 41 ++++++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 11 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 1f1adb6b4..6b9764c67 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -106,6 
+106,10 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':colbertreport', ['ComedyCentralShows']) self.assertMatch(':cr', ['ComedyCentralShows']) + def test_vimeo_matching(self): + self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 167801ae2..13a6f4b2f 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( DailymotionPlaylistIE, DailymotionUserIE, VimeoChannelIE, + VimeoUserIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, @@ -54,6 +55,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Vimeo Tributes') self.assertTrue(len(result['entries']) > 24) + def test_vimeo_user(self): + dl = FakeYDL() + ie = VimeoUserIE(dl) + result = ie.extract('http://vimeo.com/nkistudio/videos') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Nki') + self.assertTrue(len(result['entries']) > 65) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 664639b53..cc93e619c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -159,7 +159,11 @@ from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE -from .vimeo import VimeoIE, VimeoChannelIE +from .vimeo import ( + VimeoIE, + VimeoChannelIE, + VimeoUserIE, +) from .vine import VineIE from .viki import VikiIE from .vk import VKIE diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7d82c2cfa..f27763ae2 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -249,25 +249,46 @@ class VimeoChannelIE(InfoExtractor): IE_NAME = 
u'vimeo:channel' _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)' _MORE_PAGES_INDICATOR = r'<a.+?rel="next"' + _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + def _extract_videos(self, list_id, base_url): video_ids = [] - for pagenum in itertools.count(1): - webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum), - channel_id, u'Downloading page %s' % pagenum) + webpage = self._download_webpage( + '%s/videos/page:%d/' % (base_url, pagenum),list_id, + u'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') for video_id in video_ids] - channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id, - webpage, u'channel title') + list_title = self._html_search_regex(self._TITLE_RE, webpage, + u'list title') return {'_type': 'playlist', - 'id': channel_id, - 'title': channel_title, + 'id': list_id, + 'title': list_title, 'entries': entries, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) + + +class VimeoUserIE(VimeoChannelIE): + IE_NAME = u'vimeo:user' + _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)' + _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' + + @classmethod + def suitable(cls, url): + if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url): + return False + return super(VimeoUserIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + return self._extract_videos(name, 'http://vimeo.com/%s' % name) From 6ad14cab599b05a658756fef47d3837281429da7 Mon Sep 17 
00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 13:37:05 +0100 Subject: [PATCH 195/425] Add --socket-timeout option --- youtube_dl/YoutubeDL.py | 4 +++- youtube_dl/__init__.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b7393fd79..b68b110a4 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -971,7 +971,9 @@ class YoutubeDL(object): write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') def _setup_opener(self): - timeout = float(self.params.get('socket_timeout', 600)) + timeout_val = self.params.get('socket_timeout') + timeout = 600 if timeout_val is None else float(timeout_val) + opts_cookiefile = self.params.get('cookiefile') opts_proxy = self.params.get('proxy') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 92e583744..799eca566 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -198,6 +198,9 @@ def parseOpts(overrideArguments=None): general.add_option( '--no-cache-dir', action='store_const', const=None, dest='cachedir', help='Disable filesystem caching') + general.add_option( + '--socket-timeout', dest='socket_timeout', + type=float, default=None, help=optparse.SUPPRESS_HELP) selection.add_option('--playlist-start', @@ -652,6 +655,7 @@ def _real_main(argv=None): 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, 'proxy': opts.proxy, + 'socket_timeout': opts.socket_timeout, } with YoutubeDL(ydl_opts) as ydl: From 0037e02921e7f70409ce113fb060765a6f24a27e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 13:37:26 +0100 Subject: [PATCH 196/425] release 2013.12.02 --- README.md | 7 ++++--- youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index af4d969d6..031e436b6 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,8 @@ which means you can modify it, 
redistribute it or use it however you like. --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age --download-archive FILE Download only videos not present in the archive - file. Record all downloaded videos in it. + file. Record the IDs of all downloaded videos in + it. ## Download Options: -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. @@ -130,8 +131,8 @@ which means you can modify it, redistribute it or use it however you like. -v, --verbose print various debugging information --dump-intermediate-pages print downloaded pages to debug problems(very verbose) - --write-pages Write downloaded pages to files in the current - directory + --write-pages Write downloaded intermediary pages to files in + the current directory to debug problems ## Video Format Options: -f, --format FORMAT video format code, specify the order of diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a73d7fb5c..d8f341ab9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.29' +__version__ = '2013.12.02' From 5270d8cb1389a9b26fa698137bf4861d4bab6a25 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Mon, 2 Dec 2013 20:10:19 +0700 Subject: [PATCH 197/425] Added extractors for smotri.com --- test/test_playlists.py | 22 ++- youtube_dl/extractor/__init__.py | 5 + youtube_dl/extractor/smotri.py | 239 +++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/smotri.py diff --git a/test/test_playlists.py b/test/test_playlists.py index 13a6f4b2f..00c950109 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -22,7 +22,9 @@ from youtube_dl.extractor import ( LivestreamIE, NHLVideocenterIE, BambuserChannelIE, - BandcampAlbumIE + BandcampAlbumIE, + SmotriCommunityIE, + SmotriUserIE ) @@ -119,6 +121,24 @@ class TestPlaylists(unittest.TestCase): 
self.assertIsPlaylist(result) self.assertEqual(result['title'], u'Nightmare Night EP') self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_community(self): + dl = FakeYDL() + ie = SmotriCommunityIE(dl) + result = ie.extract('http://smotri.com/community/video/kommuna') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'kommuna') + self.assertEqual(result['title'], u'КПРФ') + self.assertTrue(len(result['entries']) >= 4) + + def test_smotri_user(self): + dl = FakeYDL() + ie = SmotriUserIE(dl) + result = ie.extract('http://smotri.com/user/inspector') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'inspector') + self.assertEqual(result['title'], u'Inspector') + self.assertTrue(len(result['entries']) >= 9) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cc93e619c..bd996483b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,6 +121,11 @@ from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE +from .smotri import ( + SmotriIE, + SmotriCommunityIE, + SmotriUserIE, +) from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import ( diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py new file mode 100644 index 000000000..ea42d5320 --- /dev/null +++ b/youtube_dl/extractor/smotri.py @@ -0,0 +1,239 @@ +# encoding: utf-8 + +import re +import json +import hashlib + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError +) + + +class SmotriIE(InfoExtractor): + IE_DESC = u'Smotri.com' + IE_NAME = u'smotri' + _VALID_URL = r'^(?:http://)?(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + + _TESTS = [ + # real video id 2610366 + { + u'url': 
u'http://smotri.com/video/view/?id=v261036632ab', + u'file': u'v261036632ab.mp4', + u'md5': u'46a72e83a6ad8862b64fa6953fa93f8a', + u'info_dict': { + u'title': u'катастрофа с камер видеонаблюдения', + u'uploader': u'rbc2008', + u'uploader_id': u'rbc08', + u'upload_date': u'20131118', + u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg' + }, + }, + # real video id 57591 + { + u'url': u'http://smotri.com/video/view/?id=v57591cb20', + u'file': u'v57591cb20.flv', + u'md5': u'9eae59f6dda7087bf39a140e2fff5757', + u'info_dict': { + u'title': u'test', + u'uploader': u'Support Photofile@photofile', + u'uploader_id': u'support-photofile', + u'upload_date': u'20070704', + u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg' + }, + }, + # video-password + { + u'url': u'http://smotri.com/video/view/?id=v1390466a13c', + u'file': u'v1390466a13c.mp4', + u'md5': u'fe4dd9357558d5ee3c8fc0ef0d39de66', + u'info_dict': { + u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', + u'uploader': u'timoxa40', + u'uploader_id': u'timoxa40', + u'upload_date': u'20100404', + u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg' + }, + u'params': { + u'videopassword': u'qwerty', + }, + }, + # age limit + video-password + { + u'url': u'http://smotri.com/video/view/?id=v15408898bcf', + u'file': u'v15408898bcf.flv', + u'md5': u'c66a5d61379ac6fde06f07eebe436316', + u'info_dict': { + u'title': u'этот ролик не покажут по ТВ', + u'uploader': u'zzxxx', + u'uploader_id': u'ueggb', + u'upload_date': u'20101001', + u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', + u'age_limit': 18 + }, + u'params': { + u'videopassword': u'333' + } + } + ] + + _SUCCESS = 0 + _PASSWORD_NOT_VERIFIED = 1 + _PASSWORD_DETECTED = 2 + _VIDEO_NOT_FOUND = 3 + + def _search_meta(self, name, html, display_name=None): + if display_name is None: + display_name = name + return self._html_search_regex( + r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name), + html, display_name, 
fatal=False) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + real_video_id = mobj.group('realvideoid') + + # Download video JSON data + video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id + video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON') + video_json = json.loads(video_json_page) + + status = video_json['status'] + if status == self._VIDEO_NOT_FOUND: + raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with + # video-password set + video_password = self._downloader.params.get('videopassword', None) + if not video_password: + raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) + video_json_url += '&md5pass=%s' % hashlib.md5(video_password).hexdigest() + video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') + video_json = json.loads(video_json_page) + status = video_json['status'] + if status == self._PASSWORD_NOT_VERIFIED: + raise ExtractorError(u'Video password is invalid', expected=True) + + if status != self._SUCCESS: + raise ExtractorError(u'Unexpected status value %s' % status) + + # Extract the URL of the video + video_url = video_json['file_data'] + video_ext = determine_ext(video_url) + + # Video JSON does not provide enough meta data + # We will extract some from the video web page instead + video_page_url = 'http://' + mobj.group('url') + video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') + + # Adult content + if re.search(u'EroConfirmText">', video_page) is not None: + self.report_age_confirmation() + confirm_string = self._html_search_regex( + ur'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + video_page, u'confirm string') + confirm_url = 
video_page_url + '&confirm=%s' % confirm_string + video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') + adult_content = True + else: + adult_content = False + + # Extract the rest of meta data + video_title = self._search_meta(u'name', video_page, u'title') + if not video_title: + video_title = video_url.rsplit('/', 1)[-1] + + video_description = self._search_meta(u'description', video_page) + video_thumbnail = self._search_meta(u'thumbnail', video_page) + + upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') + upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) + video_upload_date = ( + ( + upload_date_m.group('year') + + upload_date_m.group('month') + + upload_date_m.group('day') + ) + if upload_date_m else None + ) + + duration_str = self._search_meta(u'duration', video_page) + duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) + video_duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m else None + ) + + video_uploader = self._html_search_regex( + ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', + video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) + + video_uploader_id = self._html_search_regex( + ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\(.*?\'([^\']+)\'\);">', + video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) + + video_view_count = self._html_search_regex( + ur'Общее количество просмотров.*?<span class="Number">(\d+)</span>', + video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) + + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'ext': video_ext, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'uploader': 
video_uploader, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'video_duration': video_duration, + 'view_count': video_view_count, + 'age_limit': 18 if adult_content else 0, + 'video_page_url': video_page_url + } + +class SmotriCommunityIE(InfoExtractor): + IE_DESC = u'Smotri.com community videos' + IE_NAME = u'smotri:community' + _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + community_id = mobj.group('communityid') + + url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id + rss = self._download_xml(url, community_id, u'Downloading community RSS') + + entries = [self.url_result(video_url.text, 'Smotri') + for video_url in rss.findall('./channel/item/link')] + + community_title = self._html_search_regex( + ur'^Видео сообщества "([^"]+)"$', rss.find('./channel/description').text, u'community title') + + return self.playlist_result(entries, community_id, community_title) + +class SmotriUserIE(InfoExtractor): + IE_DESC = u'Smotri.com user videos' + IE_NAME = u'smotri:user' + _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url); + user_id = mobj.group('userid') + + url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id + rss = self._download_xml(url, user_id, u'Downloading user RSS') + + entries = [self.url_result(video_url.text, 'Smotri') + for video_url in rss.findall('./channel/item/link')] + + user_nickname = self._html_search_regex( + ur'^Видео режиссера (.*)$', rss.find('./channel/description').text, u'user nickname') + + return self.playlist_result(entries, user_id, user_nickname) + \ No newline at end of file From aaebed13a8447961e23cca9c75b097732c246476 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 17:08:17 
+0100 Subject: [PATCH 198/425] [smotri] Simplify --- youtube_dl/extractor/common.py | 3 +- youtube_dl/extractor/smotri.py | 91 +++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4f1b50880..1b049082d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -364,7 +364,8 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\']) + r'''(?ix)<meta + (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=False) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index ea42d5320..f035a3214 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -14,46 +14,49 @@ from ..utils import ( class SmotriIE(InfoExtractor): IE_DESC = u'Smotri.com' IE_NAME = u'smotri' - _VALID_URL = r'^(?:http://)?(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' - + _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))' + _TESTS = [ # real video id 2610366 { u'url': u'http://smotri.com/video/view/?id=v261036632ab', u'file': u'v261036632ab.mp4', - u'md5': u'46a72e83a6ad8862b64fa6953fa93f8a', + u'md5': u'2a7b08249e6f5636557579c368040eb9', u'info_dict': { u'title': u'катастрофа с камер видеонаблюдения', u'uploader': u'rbc2008', u'uploader_id': u'rbc08', u'upload_date': u'20131118', - u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg' + u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения', + u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', }, }, # real video id 57591 { u'url': u'http://smotri.com/video/view/?id=v57591cb20', u'file': 
u'v57591cb20.flv', - u'md5': u'9eae59f6dda7087bf39a140e2fff5757', + u'md5': u'830266dfc21f077eac5afd1883091bcd', u'info_dict': { u'title': u'test', u'uploader': u'Support Photofile@photofile', u'uploader_id': u'support-photofile', u'upload_date': u'20070704', - u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg' - }, + u'description': u'test, видео test', + u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', + }, }, # video-password { u'url': u'http://smotri.com/video/view/?id=v1390466a13c', u'file': u'v1390466a13c.mp4', - u'md5': u'fe4dd9357558d5ee3c8fc0ef0d39de66', + u'md5': u'f6331cef33cad65a0815ee482a54440b', u'info_dict': { u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', u'uploader': u'timoxa40', u'uploader_id': u'timoxa40', u'upload_date': u'20100404', - u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg' + u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', + u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', }, u'params': { u'videopassword': u'qwerty', @@ -63,15 +66,16 @@ class SmotriIE(InfoExtractor): { u'url': u'http://smotri.com/video/view/?id=v15408898bcf', u'file': u'v15408898bcf.flv', - u'md5': u'c66a5d61379ac6fde06f07eebe436316', + u'md5': u'91e909c9f0521adf5ee86fbe073aad70', u'info_dict': { u'title': u'этот ролик не покажут по ТВ', u'uploader': u'zzxxx', u'uploader_id': u'ueggb', u'upload_date': u'20101001', u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - u'age_limit': 18 - }, + u'age_limit': 18, + u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ', + }, u'params': { u'videopassword': u'333' } @@ -82,14 +86,15 @@ class SmotriIE(InfoExtractor): _PASSWORD_NOT_VERIFIED = 1 _PASSWORD_DETECTED = 2 _VIDEO_NOT_FOUND = 3 - + def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name return self._html_search_regex( r'<meta itemprop="%s" 
content="([^"]+)" />' % re.escape(name), html, display_name, fatal=False) - + return self._html_search_meta(name, html, display_name) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -103,12 +108,12 @@ class SmotriIE(InfoExtractor): status = video_json['status'] if status == self._VIDEO_NOT_FOUND: raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) - elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with + elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with # video-password set video_password = self._downloader.params.get('videopassword', None) if not video_password: raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True) - video_json_url += '&md5pass=%s' % hashlib.md5(video_password).hexdigest() + video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest() video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)') video_json = json.loads(video_json_page) status = video_json['status'] @@ -120,7 +125,6 @@ class SmotriIE(InfoExtractor): # Extract the URL of the video video_url = video_json['file_data'] - video_ext = determine_ext(video_url) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -131,7 +135,7 @@ class SmotriIE(InfoExtractor): if re.search(u'EroConfirmText">', video_page) is not None: self.report_age_confirmation() confirm_string = self._html_search_regex( - ur'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, video_page, u'confirm string') confirm_url = video_page_url + '&confirm=%s' % confirm_string video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)') @@ -143,11 
+147,17 @@ class SmotriIE(InfoExtractor): video_title = self._search_meta(u'name', video_page, u'title') if not video_title: video_title = video_url.rsplit('/', 1)[-1] - + video_description = self._search_meta(u'description', video_page) + END_TEXT = u' на сайте Smotri.com' + if video_description.endswith(END_TEXT): + video_description = video_description[:-len(END_TEXT)] + START_TEXT = u'Смотреть онлайн ролик ' + if video_description.startswith(START_TEXT): + video_description = video_description[len(START_TEXT):] video_thumbnail = self._search_meta(u'thumbnail', video_page) - - upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') + + upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) video_upload_date = ( ( @@ -170,22 +180,21 @@ class SmotriIE(InfoExtractor): ) video_uploader = self._html_search_regex( - ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', + u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL) video_uploader_id = self._html_search_regex( - ur'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\(.*?\'([^\']+)\'\);">', + u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">', video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL) video_view_count = self._html_search_regex( - ur'Общее количество просмотров.*?<span class="Number">(\d+)</span>', + u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL) return { 'id': video_id, 'url': video_url, 'title': video_title, - 'ext': video_ext, 'thumbnail': video_thumbnail, 'description': video_description, 'uploader': video_uploader, @@ -197,43 +206,47 @@ class 
SmotriIE(InfoExtractor): 'video_page_url': video_page_url } + class SmotriCommunityIE(InfoExtractor): IE_DESC = u'Smotri.com community videos' IE_NAME = u'smotri:community' - _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) community_id = mobj.group('communityid') - + url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id rss = self._download_xml(url, community_id, u'Downloading community RSS') - + entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - + + description_text = rss.find('./channel/description').text community_title = self._html_search_regex( - ur'^Видео сообщества "([^"]+)"$', rss.find('./channel/description').text, u'community title') + u'^Видео сообщества "([^"]+)"$', description_text, u'community title') return self.playlist_result(entries, community_id, community_title) - + + class SmotriUserIE(InfoExtractor): IE_DESC = u'Smotri.com user videos' IE_NAME = u'smotri:user' - _VALID_URL = r'^(?:http://)?(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' - + _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url); + mobj = re.match(self._VALID_URL, url) user_id = mobj.group('userid') - + url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id rss = self._download_xml(url, user_id, u'Downloading user RSS') - + entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - + + description_text = rss.find('./channel/description').text user_nickname = self._html_search_regex( - ur'^Видео режиссера (.*)$', rss.find('./channel/description').text, u'user nickname') + u'^Видео режиссера (.*)$', 
description_text, + u'user nickname') return self.playlist_result(entries, user_id, user_nickname) - \ No newline at end of file From 87968574293ef87b98f51cf0d7c0958b9f496a7a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Dec 2013 17:43:22 +0100 Subject: [PATCH 199/425] Credit @dstftw for smotri IE --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 799eca566..2eeef2ae9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -36,6 +36,7 @@ __authors__ = ( 'Marcin Cieślak', 'Anton Larionov', 'Takuya Tsuchida', + 'Sergey M.', ) __license__ = 'Public Domain' From 36a826a50dc5e53af8355f1233cc4f3ceba2e61b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 11:54:52 +0100 Subject: [PATCH 200/425] Clarify --download-archive help (#1757) --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2eeef2ae9..48137ebe5 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -224,7 +224,7 @@ def parseOpts(overrideArguments=None): default=None, type=int) selection.add_option('--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not present in the archive file. Record the IDs of all downloaded videos in it.') + help='Download only videos not listed in the archive file. 
Record the IDs of all downloaded videos in it.') authentication.add_option('-u', '--username', From 1b753cb3344837fb69e9bfde89d03161d33ba3ff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:04:02 +0100 Subject: [PATCH 201/425] Add Windows configuration file locations (#1881) --- README.md | 2 +- youtube_dl/__init__.py | 29 ++++++++++++++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 031e436b6..0ff6ff8b9 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config` and `C:\Users\<Yourname>\youtube-dl.conf`. 
# OUTPUT TEMPLATE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 48137ebe5..32490b24e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -81,15 +81,13 @@ from .PostProcessor import ( def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes): + def _readOptions(filename_bytes, def=[]): try: optionf = open(filename_bytes) except IOError: - return [] # silently skip if file is not present + return def # silently skip if file is not present try: - res = [] - for l in optionf: - res += shlex.split(l, comments=True) + res = [shlex.split(l, comments=True) for l in optionf] finally: optionf.close() return res @@ -419,6 +417,8 @@ def parseOpts(overrideArguments=None): if opts.verbose: write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n') else: + systemConf = _readOptions('/etc/youtube-dl.conf') + xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if xdg_config_home: userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') @@ -428,8 +428,23 @@ def parseOpts(overrideArguments=None): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config') if not os.path.isfile(userConfFile): userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') - systemConf = _readOptions('/etc/youtube-dl.conf') - userConf = _readOptions(userConfFile) + userConf = _readOptions(userConfFile, None) + + if userConf is None: + appdata_dir = os.environ.get('appdata') + if appdata_dir: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config'), + def=None) + + if userConf is None: + userConfFile = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'), + def=None) + + if userConf is None: + userConf = [] + commandLineConf = sys.argv[1:] argv = systemConf + userConf + commandLineConf opts, args = parser.parse_args(argv) From fb27c2295e0e9d6f2f6ac45ed5906987b4710d0a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> 
Date: Tue, 3 Dec 2013 13:09:48 +0100 Subject: [PATCH 202/425] Correct configuration file locations --- README.md | 2 +- youtube_dl/__init__.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0ff6ff8b9..85af7cf7e 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config` and `C:\Users\<Yourname>\youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`. 
# OUTPUT TEMPLATE diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 32490b24e..9c8a694f0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -81,11 +81,11 @@ from .PostProcessor import ( def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes, def=[]): + def _readOptions(filename_bytes, default=[]): try: optionf = open(filename_bytes) except IOError: - return def # silently skip if file is not present + return default # silently skip if file is not present try: res = [shlex.split(l, comments=True) for l in optionf] finally: @@ -435,12 +435,20 @@ def parseOpts(overrideArguments=None): if appdata_dir: userConf = _readOptions( os.path.join(appdata_dir, 'youtube-dl', 'config'), - def=None) + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), + default=None) if userConf is None: - userConfFile = _readOptions( + userConf = _readOptions( os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'), - def=None) + default=None) + if userConf is None: + userConf = _readOptions( + os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'), + default=None) if userConf is None: userConf = [] From a0eaa341e1ce6254179c1a00a11704da1887e124 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:11:20 +0100 Subject: [PATCH 203/425] [configuration] Undo code breakage --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9c8a694f0..fff295e8c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -87,7 +87,9 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = [shlex.split(l, comments=True) for l in optionf] + res = [] + for l in optionf: + res += shlex.split(l, comments=True) finally: optionf.close() return res From 
731e3dde299844fc3b0f369d5a161fa4df0eb718 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:13:09 +0100 Subject: [PATCH 204/425] release 2013.12.03 --- README.md | 2 +- youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85af7cf7e..00975ab5e 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ which means you can modify it, redistribute it or use it however you like. --dateafter DATE download only videos uploaded after this date --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age - --download-archive FILE Download only videos not present in the archive + --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d8f341ab9..f9a339c02 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.02' +__version__ = '2013.12.03' From cf6758d2040816033ec47afe9c1d497e4c2abd4d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 13:33:07 +0100 Subject: [PATCH 205/425] Document disabling proxy (#1882) --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fff295e8c..d2446b670 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -191,7 +191,9 @@ def parseOpts(overrideArguments=None): general.add_option('--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', help='Output descriptions of all supported extractors', default=False) - general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') + general.add_option( + '--proxy', dest='proxy', default=None, metavar='URL', + help='Use the specified 
HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', From cb7fb54600a96bcced33020b925f2cfc9428bd4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 13:55:25 +0100 Subject: [PATCH 206/425] Change the ie_name of YoutubeSearchDateIE It produced a duplicate entry when listing the extractors with '--list-extractors' and generates noise in the commit log when generating the supported sites webpage (like in 09f355f73bf1657ecacfd05eda21d2c4bf1cc4a8) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 765b4a9bf..66f5af000 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1765,6 +1765,7 @@ class YoutubeSearchIE(SearchInfoExtractor): return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): + IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = u'YouTube.com searches, newest videos first' From e9d8e302aafdb6fcf72c44d582c1f6d4447cd5fc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 14:06:08 +0100 Subject: [PATCH 207/425] [xhamster] Change test checksum --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 7444d3393..279f75e7a 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -26,7 +26,7 @@ 
class XHamsterIE(InfoExtractor): { u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', u'file': u'2221348.flv', - u'md5': u'e767b9475de189320f691f49c679c4c7', + u'md5': u'970a94178ca4118c5aa3aaea21211b81', u'info_dict': { u"upload_date": u"20130914", u"uploader_id": u"jojo747400", From 938384c587c33696bcdb9c28b982e2b744695b3d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 14:08:16 +0100 Subject: [PATCH 208/425] [redtube] Fix search for title --- youtube_dl/extractor/redtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 3bbda128e..c2254ae8a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -30,7 +30,7 @@ class RedTubeIE(InfoExtractor): r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL') video_title = self._html_search_regex( - r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', + r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') # No self-labeling, but they describe themselves as From ce93879a9b3b1661db3e65ec43649c5b6a08778c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 3 Dec 2013 14:16:58 +0100 Subject: [PATCH 209/425] [daum] Fix real video ID extraction --- youtube_dl/extractor/daum.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 3d1dcb793..fe7cfb064 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -28,7 +28,8 @@ class DaumIE(InfoExtractor): video_id = mobj.group(1) canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) - full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"', + full_id = self._search_regex( + r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', 
webpage, u'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( From fb7abb31af93a2a1d84ba17beb0f389dd09eafdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 14:21:06 +0100 Subject: [PATCH 210/425] Remove the compatibility code used before the new format system was implemented --- youtube_dl/extractor/appletrailers.py | 9 ++------- youtube_dl/extractor/archiveorg.py | 11 ++--------- youtube_dl/extractor/comedycentral.py | 9 ++------- youtube_dl/extractor/daum.py | 5 +---- youtube_dl/extractor/dreisat.py | 7 +------ youtube_dl/extractor/faz.py | 5 +---- youtube_dl/extractor/gamespot.py | 5 +---- youtube_dl/extractor/metacritic.py | 5 +---- youtube_dl/extractor/mtv.py | 7 +------ youtube_dl/extractor/naver.py | 5 +---- youtube_dl/extractor/trilulilu.py | 6 +----- youtube_dl/extractor/viddler.py | 8 +------- 12 files changed, 15 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 6d6237f8a..4befff394 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor): }) formats = sorted(formats, key=lambda f: (f['height'], f['width'])) - info = { + playlist.append({ '_type': 'video', 'id': video_id, 'title': title, @@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor): 'upload_date': upload_date, 'uploader_id': uploader_id, 'user_agent': 'QuickTime compatible (youtube-dl)', - } - # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['ext'] - - playlist.append(info) + }) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 61ce4469a..3ae0aebb1 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -49,7 +49,7 @@ class 
ArchiveOrgIE(InfoExtractor): for f in formats: f['ext'] = determine_ext(f['url']) - info = { + return { '_type': 'video', 'id': video_id, 'title': title, @@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor): 'description': description, 'uploader': uploader, 'upload_date': upload_date, + 'thumbnail': data.get('misc', {}).get('image'), } - thumbnail = data.get('misc', {}).get('image') - if thumbnail: - info['thumbnail'] = thumbnail - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 23647f99e..41ef9ad47 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -197,7 +197,7 @@ class ComedyCentralShowsIE(InfoExtractor): }) effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) - info = { + results.append({ 'id': shortMediaId, 'formats': formats, 'uploader': showId, @@ -205,11 +205,6 @@ class ComedyCentralShowsIE(InfoExtractor): 'title': effTitle, 'thumbnail': None, 'description': compat_str(officialTitle), - } - - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - results.append(info) + }) return results diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index fe7cfb064..d418ce4a8 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -57,7 +57,7 @@ class DaumIE(InfoExtractor): 'format_id': profile, }) - info = { + return { 'id': video_id, 'title': info.find('TITLE').text, 'formats': formats, @@ -66,6 +66,3 @@ class DaumIE(InfoExtractor): 'duration': int(info.find('DURATION').text), 'upload_date': info.find('REGDTTM').text[:8], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 3cb382e12..24ce79425 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -65,7 
+65,7 @@ class DreiSatIE(InfoExtractor): return (qidx, prefer_http, format['video_bitrate']) formats.sort(key=_sortkey) - info = { + return { '_type': 'video', 'id': video_id, 'title': video_title, @@ -76,8 +76,3 @@ class DreiSatIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': upload_date, } - - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return info diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index c0169de04..d0dfde694 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -44,13 +44,10 @@ class FazIE(InfoExtractor): }) descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') - info = { + return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, 'description': descr, 'thumbnail': config.find('STILL/STILL_BIG').text, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 9645b00c3..26b7d2ae5 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor): 'format_id': q, }) - info = { + return { 'id': data_video['guid'], 'title': compat_urllib_parse.unquote(data_video['title']), 'formats': formats, 'description': get_meta_content('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 449138b56..6b95b4998 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -43,13 +43,10 @@ class MetacriticIE(InfoExtractor): description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', webpage, u'description', flags=re.DOTALL) - info = { + return { 'id': video_id, 'title': clip.find('title').text, 
'formats': formats, 'description': description, 'duration': int(clip.find('duration').text), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 42aee58be..972336782 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -93,7 +93,7 @@ class MTVIE(InfoExtractor): else: description = None - info = { + return { 'title': itemdoc.find('title').text, 'formats': self._extract_video_formats(mediagen_page), 'id': video_id, @@ -101,11 +101,6 @@ class MTVIE(InfoExtractor): 'description': description, } - # TODO: Remove when #980 has been merged - info.update(info['formats'][-1]) - - return info - def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index d290397c7..c012ec0cf 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -56,7 +56,7 @@ class NaverIE(InfoExtractor): 'height': int(format_el.find('height').text), }) - info = { + return { 'id': video_id, 'title': info.find('Subject').text, 'formats': formats, @@ -65,6 +65,3 @@ class NaverIE(InfoExtractor): 'upload_date': info.find('WriteDate').text.replace('.', ''), 'view_count': int(info.find('PlayCount').text), } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 1c49e580d..d64aaa41f 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -55,7 +55,7 @@ class TriluliluIE(InfoExtractor): for fnode in format_doc.findall('./formats/format') ] - info = { + return { '_type': 'video', 'id': video_id, 'formats': formats, @@ -64,7 +64,3 @@ class TriluliluIE(InfoExtractor): 'thumbnail': thumbnail, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - - return 
info diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 826804af3..75335dfb8 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -47,7 +47,7 @@ class ViddlerIE(InfoExtractor): r"thumbnail\s*:\s*'([^']*)'", webpage, u'thumbnail', fatal=False) - info = { + return { '_type': 'video', 'id': video_id, 'title': title, @@ -56,9 +56,3 @@ class ViddlerIE(InfoExtractor): 'duration': duration, 'formats': formats, } - - # TODO: Remove when #980 has been merged - info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url']) - info.update(info['formats'][-1]) - - return info From 84db81815af6787d91188ca065cc9ced4d83a4ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 14:58:24 +0100 Subject: [PATCH 211/425] Move common code for extractors based in MTV services to a new base class Removes the duplication of the thumbnail extraction code (only MTVIE needs to override it) --- youtube_dl/extractor/comedycentral.py | 10 +--- youtube_dl/extractor/gametrailers.py | 16 ++---- youtube_dl/extractor/mtv.py | 69 ++++++++++++++---------- youtube_dl/extractor/southparkstudios.py | 13 +---- 4 files changed, 48 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 41ef9ad47..53579aa27 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +11,7 @@ from ..utils import ( ) -class ComedyCentralIE(MTVIE): +class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' @@ -25,12 +25,6 
@@ class ComedyCentralIE(MTVIE): u'description': u'After a certain point, breastfeeding becomes c**kblocking.', }, } - # Overwrite MTVIE properties we don't want - _TESTS = [] - - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 3cc02d97e..3a8bef250 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,13 +1,11 @@ import re -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor -class GametrailersIE(MTVIE): - """ - Gametrailers use the same videos system as MTVIE, it just changes the feed - url, where the uri is and the method to get the thumbnails. - """ + +class GametrailersIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' + _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', @@ -17,15 +15,9 @@ class GametrailersIE(MTVIE): u'description': u'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } - # Overwrite MTVIE properties we don't want - _TESTS = [] _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - return itemdoc.find(search_path).attrib['url'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 972336782..6b3feb560 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -10,35 +10,8 @@ from ..utils import ( def _media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' - - _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' - - _TESTS = [ - { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', - }, - }, - { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', - }, - u'skip': u'VEVO is only available in some countries', - }, - ] +class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _id_from_uri(uri): return uri.split(':')[-1] @@ -53,7 +26,12 @@ class MTVIE(InfoExtractor): return base + m.group('finalid') def _get_thumbnail_url(self, uri, itemdoc): - return 'http://mtv.mtvnimages.com/uri/' + 
uri + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + thumb_node = itemdoc.find(search_path) + if thumb_node is None: + return None + else: + return thumb_node.attrib['url'] def _extract_video_formats(self, metadataXml): if '/error_country_block.swf' in metadataXml: @@ -108,6 +86,39 @@ class MTVIE(InfoExtractor): u'Downloading info') return [self._get_video_info(item) for item in idoc.findall('.//item')] + +class MTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + + _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' + + _TESTS = [ + { + u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + u'file': u'853555.mp4', + u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', + u'info_dict': { + u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', + u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + }, + }, + { + u'add_ie': ['Vevo'], + u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + u'file': u'USCJY1331283.mp4', + u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', + u'info_dict': { + u'title': u'Everything Has Changed', + u'upload_date': u'20130606', + u'uploader': u'Taylor Swift', + }, + u'skip': u'VEVO is only available in some countries', + }, + ] + + def _get_thumbnail_url(self, uri, itemdoc): + return 'http://mtv.mtvnimages.com/uri/' + uri + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index a711531e6..fd90cc5dd 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -1,15 +1,14 @@ import re -from .mtv import MTVIE, _media_xml_tag +from .mtv import MTVServicesInfoExtractor -class SouthParkStudiosIE(MTVIE): +class 
SouthParkStudiosIE(MTVServicesInfoExtractor): IE_NAME = u'southparkstudios.com' _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' - # Overwrite MTVIE properties we don't want _TESTS = [{ u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured', u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4', @@ -19,14 +18,6 @@ class SouthParkStudiosIE(MTVIE): }, }] - def _get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) - thumb_node = itemdoc.find(search_path) - if thumb_node is None: - return None - else: - return thumb_node.attrib['url'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) url = u'http://www.' + mobj.group(u'url') From 1dcc4c0cad886457c0fa5f874c38f95f0510ea4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 14:57:53 +0100 Subject: [PATCH 212/425] Add --load-info option (#972) It just calls the 'YoutubeDL.process_ie_result' with the dictionary from the json file --- youtube_dl/YoutubeDL.py | 6 ++++++ youtube_dl/__init__.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b68b110a4..80c056dc8 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -812,6 +812,12 @@ class YoutubeDL(object): return self._download_retcode + def download_with_info_file(self, info_filename): + with open(info_filename, 'r') as f: + # TODO: Check for errors + info = json.load(f) + self.process_ie_result(info, download=True) + def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" info = dict(ie_info) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index d2446b670..b0d9a6763 100644 --- 
a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -360,6 +360,9 @@ def parseOpts(overrideArguments=None): help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) filesystem.add_option('-a', '--batch-file', dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') + filesystem.add_option('--load-info', + dest='load_info_filename', metavar='FILE', + help='json file containing the video information (created with the "--write-json" option') filesystem.add_option('-w', '--no-overwrites', action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) filesystem.add_option('-c', '--continue', @@ -706,14 +709,17 @@ def _real_main(argv=None): update_self(ydl.to_screen, opts.verbose) # Maybe do nothing - if len(all_urls) < 1: + if (len(all_urls) < 1) and (opts.load_info_filename is None): if not opts.update_self: parser.error(u'you must provide at least one URL') else: sys.exit() try: - retcode = ydl.download(all_urls) + if opts.load_info_filename is not None: + retcode = ydl.download_with_info_file(opts.load_info_filename) + else: + retcode = ydl.download(all_urls) except MaxDownloadsReached: ydl.to_screen(u'--max-download limit reached, aborting.') retcode = 101 From d494389821de832874dc78abc2fe16365b5fe815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 20:16:52 +0100 Subject: [PATCH 213/425] Option '--load-info': if the download fails, try extracting the info with the 'webpage_url' field of the info dict The video url may have expired. 
--- youtube_dl/YoutubeDL.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 80c056dc8..77339dddf 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -816,7 +816,16 @@ class YoutubeDL(object): with open(info_filename, 'r') as f: # TODO: Check for errors info = json.load(f) - self.process_ie_result(info, download=True) + try: + self.process_ie_result(info, download=True) + except DownloadError: + webpage_url = info.get('webpage_url') + if webpage_url is not None: + self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url) + return self.download([webpage_url]) + else: + raise + return self._download_retcode def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" From 55f6597c67dd04729dbc1b83d81bfbd63d7e9c0a Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 08:41:09 +0700 Subject: [PATCH 214/425] [smotri] Add an extractor for live rtmp broadcasts --- youtube_dl/FileDownloader.py | 49 ++++++++++++-- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/smotri.py | 106 ++++++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 8 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 3ff9716b3..de1dc66bb 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -226,6 +226,22 @@ class FileDownloader(object): (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True) self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' % (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) + + def report_progress_live_stream(self, downloaded_data_len, speed, elapsed): + if self.params.get('noprogress', False): + return + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + downloaded_str = format_bytes(downloaded_data_len) + speed_str = 
self.format_speed(speed) + elapsed_str = FileDownloader.format_seconds(elapsed) + if self.params.get('progress_with_newline', False): + self.to_screen(u'[download] %s at %s' % + (downloaded_str, speed_str)) + else: + self.to_screen(u'\r%s[download] %s at %s ET %s' % + (clear_line, downloaded_str, speed_str, elapsed_str), skip_eol=True) + self.to_console_title(u'youtube-dl - %s at %s ET %s' % + (downloaded_str.strip(), speed_str.strip(), elapsed_str.strip())) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -255,7 +271,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn): def run_rtmpdump(args): start = time.time() resume_percent = None @@ -301,11 +317,27 @@ class FileDownloader(object): 'eta': eta, 'speed': speed, }) - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen(u'') - cursor_in_new_line = True - self.to_screen(u'[rtmpdump] '+line) + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1))*1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self.report_progress_live_stream(downloaded_data_len, speed, time_now - start) + cursor_in_new_line = False + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'speed': speed, + }) + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen(u'') + cursor_in_new_line = True + self.to_screen(u'[rtmpdump] '+line) proc.wait() if not cursor_in_new_line: self.to_screen(u'') @@ 
-338,6 +370,8 @@ class FileDownloader(object): basic_args += ['--stop', '1'] if live: basic_args += ['--live'] + if conn: + basic_args += ['--conn', conn] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if sys.platform == 'win32' and sys.version_info < (3, 0): @@ -479,7 +513,8 @@ class FileDownloader(object): info_dict.get('page_url', None), info_dict.get('play_path', None), info_dict.get('tc_url', None), - info_dict.get('rtmp_live', False)) + info_dict.get('rtmp_live', False), + info_dict.get('rtmp_conn', None)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..60e2d6ebd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -125,6 +125,7 @@ from .smotri import ( SmotriIE, SmotriCommunityIE, SmotriUserIE, + SmotriBroadcastIE, ) from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f035a3214..f86ee8388 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -3,10 +3,12 @@ import re import json import hashlib +import uuid from .common import InfoExtractor from ..utils import ( - determine_ext, + compat_urllib_parse, + compat_urllib_request, ExtractorError ) @@ -250,3 +252,105 @@ class SmotriUserIE(InfoExtractor): u'user nickname') return self.playlist_result(entries, user_id, user_nickname) + + +class SmotriBroadcastIE(InfoExtractor): + IE_DESC = u'Smotri.com broadcasts' + IE_NAME = u'smotri:broadcast' + _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + broadcast_id = mobj.group('broadcastid') + + broadcast_url = 'http://' + mobj.group('url') + broadcast_page = 
self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page') + + if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: + raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True) + + # Adult content + if re.search(u'EroConfirmText">', broadcast_page) is not None: + + (username, password) = self._get_login_info() + if username is None: + raise ExtractorError(u'Erotic broadcasts allowed only for registered users, ' + u'use --username and --password options to provide account credentials.', expected=True) + + # Log in + login_form_strs = { + u'login-hint53': '1', + u'confirm_erotic': '1', + u'login': username, + u'password': password, + } + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_url = broadcast_url + '/?no_redirect=1' + request = compat_urllib_request.Request(login_url, login_data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + broadcast_page = self._download_webpage( + request, broadcast_id, note=u'Logging in and confirming age') + + if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None: + raise ExtractorError(u'Unable to log in: bad username or password', expected=True) + + adult_content = True + else: + adult_content = False + + ticket = self._html_search_regex( + u'window.broadcast_control.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + broadcast_page, u'broadcast ticket') + + url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket + + broadcast_password = self._downloader.params.get('videopassword', None) + if broadcast_password: + url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + + broadcast_json_page = self._download_webpage(url, broadcast_id, 
u'Downloading broadcast JSON') + + try: + broadcast_json = json.loads(broadcast_json_page) + + protected_broadcast = broadcast_json['_pass_protected'] == 1 + if protected_broadcast and not broadcast_password: + raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True) + + broadcast_offline = broadcast_json['is_play'] == 0 + if broadcast_offline: + raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True) + + rtmp_url = broadcast_json['_server'] + if not rtmp_url.startswith('rtmp://'): + raise ExtractorError(u'Unexpected broadcast rtmp URL') + + broadcast_playpath = broadcast_json['_streamName'] + broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_title = broadcast_json['title'] + broadcast_description = broadcast_json['description'] + broadcaster_nick = broadcast_json['nick'] + broadcaster_login = broadcast_json['login'] + rtmp_conn = 'S:%s' % uuid.uuid4().hex + except KeyError: + if protected_broadcast: + raise ExtractorError(u'Bad broadcast password', expected=True) + raise ExtractorError(u'Unexpected broadcast JSON') + + return { + 'id': broadcast_id, + 'url': rtmp_url, + 'title': broadcast_title, + 'thumbnail': broadcast_thumbnail, + 'description': broadcast_description, + 'uploader': broadcaster_nick, + 'uploader_id': broadcaster_login, + 'age_limit': 18 if adult_content else 0, + 'ext': 'flv', + 'play_path': broadcast_playpath, + 'rtmp_live': True, + 'rtmp_conn': rtmp_conn + } \ No newline at end of file From 8aff7b9bc47795288c65399d6fcac7a8c48004e9 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 12:36:12 +0700 Subject: [PATCH 215/425] [smotri] Fix broadcast ticket regex --- youtube_dl/extractor/smotri.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f86ee8388..ff539ea0c 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ 
-302,7 +302,7 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - u'window.broadcast_control.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + u'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', broadcast_page, u'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket From 27dcce19045670fc348ff1119c0d2283aaed3ae2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 4 Dec 2013 14:16:52 +0100 Subject: [PATCH 216/425] [youtube] Resolve URLs in comments --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 66f5af000..7fff761bd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -336,7 +336,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader": u"Philipp Hagemeister", u"uploader_id": u"phihag", u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." + u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." } }, { @@ -1366,6 +1366,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: + video_description = re.sub(r'''(?x) + <a\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? + title="([^"]+)"\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? 
+ class="yt-uix-redirect-link"\s*> + [^<]+ + </a> + ''', r'\1', video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) From 671c0f151d5a7bb5c32a59f483a8e330f1f9a15b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 4 Dec 2013 14:19:07 +0100 Subject: [PATCH 217/425] release 2013.12.04 --- README.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00975ab5e..029c418d1 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ which means you can modify it, redistribute it or use it however you like. --list-extractors List all supported extractors and the URLs they would handle --extractor-descriptions Output descriptions of all supported extractors - --proxy URL Use the specified HTTP/HTTPS proxy + --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an + empty string (--proxy "") for direct connection --no-check-certificate Suppress HTTPS certificate validation. --cache-dir DIR Location in the filesystem where youtube-dl can store downloaded information permanently. 
By diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f9a339c02..68b30bfd4 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.03' +__version__ = '2013.12.04' From c0ade33e167d1668c4aa8a6684e7083e6c71dd6e Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 20:34:47 +0700 Subject: [PATCH 218/425] Correct some extractor _VALID_URL regexes --- youtube_dl/extractor/addanime.py | 2 +- youtube_dl/extractor/appletrailers.py | 2 +- youtube_dl/extractor/archiveorg.py | 2 +- youtube_dl/extractor/arte.py | 4 ++-- youtube_dl/extractor/auengine.py | 2 +- youtube_dl/extractor/bambuser.py | 2 +- youtube_dl/extractor/bloomberg.py | 2 +- youtube_dl/extractor/comedycentral.py | 2 +- youtube_dl/extractor/cspan.py | 2 +- youtube_dl/extractor/dreisat.py | 2 +- youtube_dl/extractor/eighttracks.py | 2 +- youtube_dl/extractor/exfm.py | 2 +- youtube_dl/extractor/faz.py | 2 +- youtube_dl/extractor/fktv.py | 4 ++-- youtube_dl/extractor/francetv.py | 2 +- youtube_dl/extractor/gamekings.py | 2 +- youtube_dl/extractor/gametrailers.py | 2 +- youtube_dl/extractor/ign.py | 2 +- youtube_dl/extractor/instagram.py | 2 +- youtube_dl/extractor/jukebox.py | 2 +- youtube_dl/extractor/liveleak.py | 2 +- youtube_dl/extractor/livestream.py | 2 +- youtube_dl/extractor/muzu.py | 2 +- youtube_dl/extractor/myspass.py | 2 +- youtube_dl/extractor/orf.py | 2 +- youtube_dl/extractor/pbs.py | 2 +- youtube_dl/extractor/rutube.py | 2 +- youtube_dl/extractor/slashdot.py | 2 +- youtube_dl/extractor/soundcloud.py | 4 ++-- youtube_dl/extractor/space.py | 2 +- youtube_dl/extractor/stanfordoc.py | 2 +- youtube_dl/extractor/tf1.py | 2 +- youtube_dl/extractor/unistra.py | 2 +- youtube_dl/extractor/veehd.py | 2 +- youtube_dl/extractor/vevo.py | 2 +- youtube_dl/extractor/vice.py | 2 +- youtube_dl/extractor/viddler.py | 2 +- youtube_dl/extractor/videofyme.py | 2 +- youtube_dl/extractor/wat.py | 2 +- youtube_dl/extractor/youjizz.py | 2 +- 40 
files changed, 43 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index b99d4b966..a3a1b999d 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -13,7 +13,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' + _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)' IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 6d6237f8a..5b522552a 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -10,7 +10,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)' _TEST = { u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", u"playlist": [ diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 61ce4469a..a8394bfb0 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -11,7 +11,7 @@ from ..utils import ( class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$' + _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$' _TEST = { u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8b62ee774..56a5d009f 100644 --- 
a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,8 +17,8 @@ from ..utils import ( # add tests. class ArteTvIE(InfoExtractor): - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' - _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' + _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html' + _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 95c038003..bcccc0b7a 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -16,7 +16,7 @@ class AUEngineIE(InfoExtractor): u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]" } } - _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?' + _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?' 
def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index b80508efe..d48c0c38d 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -54,7 +54,7 @@ class BambuserIE(InfoExtractor): class BambuserChannelIE(InfoExtractor): IE_NAME = u'bambuser:channel' - _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)' + _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)' # The maximum number we can get with each request _STEP = 50 diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 3666a780b..755d9c9ef 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?).html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' _TEST = { u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 23647f99e..caea446ea 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -12,7 +12,7 @@ from ..utils import ( class ComedyCentralIE(MTVIE): - _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' _FEED_URL = u'http://comedycentral.com/feeds/mrss/' _TEST = { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7bf03c584..d5730684d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -6,7 +6,7 @@ from ..utils import ( ) class CSpanIE(InfoExtractor): - _VALID_URL = 
r'http://www.c-spanvideo.org/program/(.*)' + _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' _TEST = { u'url': u'http://www.c-spanvideo.org/program/HolderonV', u'file': u'315139.flv', diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 3cb382e12..008c99699 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -11,7 +11,7 @@ from ..utils import ( class DreiSatIE(InfoExtractor): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", u'file': u'36983.webm', diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index f21ef8853..88f5526b8 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -10,7 +10,7 @@ from ..utils import ( class EightTracksIE(InfoExtractor): IE_NAME = '8tracks' - _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' + _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$' _TEST = { u"name": u"EightTracks", u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a", diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index a51d79b08..682901d16 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ class ExfmIE(InfoExtractor): IE_NAME = u'exfm' IE_DESC = u'ex.fm' _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' - _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { u'url': u'http://ex.fm/song/eh359', diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index c0169de04..615674baf 
100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -9,7 +9,7 @@ from ..utils import ( class FazIE(InfoExtractor): IE_NAME = u'faz.net' - _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html' _TEST = { u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index dba1a8dc2..d7048c8c1 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -12,7 +12,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv' - _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' _TEST = { u'url': u'http://fernsehkritik.tv/folge-1', @@ -52,7 +52,7 @@ class FKTVIE(InfoExtractor): class FKTVPosteckeIE(InfoExtractor): IE_NAME = u'fernsehkritik.tv:postecke' - _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik.tv/inline-video/postecke.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' + _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)' _TEST = { u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120', u'file': u'0120.flv', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 6e1971043..66aa3aa0d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -45,7 +45,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = u'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+).html' + _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html' _TEST = { u'url': 
u'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index c91669b0e..a3a5251fe 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class GamekingsIE(InfoExtractor): - _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' + _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' _TEST = { u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", u'file': u'20130811.mp4', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index 3cc02d97e..88f656031 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -7,7 +7,7 @@ class GametrailersIE(MTVIE): Gametrailers use the same videos system as MTVIE, it just changes the feed url, where the uri is and the method to get the thumbnails. 
""" - _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' + _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index c52146f7d..57b79a336 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -103,7 +103,7 @@ class IGNIE(InfoExtractor): class OneUPIE(IGNIE): """Extractor for 1up.com, it uses the ign videos system.""" - _VALID_URL = r'https?://gamevideos.1up.com/(?P<type>video)/id/(?P<name_or_id>.+)' + _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 213aac428..660573d02 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor class InstagramIE(InfoExtractor): - _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' + _VALID_URL = r'(?:http://)?instagram\.com/p/(.*?)/' _TEST = { u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index c7bb234fe..592c64e1d 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -8,7 +8,7 @@ from ..utils import ( ) class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+).html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html' _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>' _VIDEO_URL = 
r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"' _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>' diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index dd062a14e..5ae57a77c 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -8,7 +8,7 @@ from ..utils import ( class LiveLeakIE(InfoExtractor): - _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' + _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' IE_NAME = u'liveleak' _TEST = { u'url': u'http://www.liveleak.com/view?i=757_1364311680', diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 9bc35b115..1dcd1fb2d 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -11,7 +11,7 @@ from ..utils import ( class LivestreamIE(InfoExtractor): IE_NAME = u'livestream' - _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' + _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' _TEST = { u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', u'file': u'4719370.mp4', diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py index 03e31ea1c..1772b7f9a 100644 --- a/youtube_dl/extractor/muzu.py +++ b/youtube_dl/extractor/muzu.py @@ -9,7 +9,7 @@ from ..utils import ( class MuzuTVIE(InfoExtractor): - _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)' IE_NAME = u'muzu.tv' _TEST = { diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 0067bf134..4becddee6 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -9,7 +9,7 @@ from ..utils import ( class 
MySpassIE(InfoExtractor): - _VALID_URL = r'http://www.myspass.de/.*' + _VALID_URL = r'http://www\.myspass\.de/.*' _TEST = { u'url': u'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', u'file': u'11741.mp4', diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cfca2a063..b42eae89a 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -12,7 +12,7 @@ from ..utils import ( ) class ORFIE(InfoExtractor): - _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' + _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 65462d867..25f019231 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class PBSIE(InfoExtractor): - _VALID_URL = r'https?://video.pbs.org/video/(?P<id>\d+)/?' + _VALID_URL = r'https?://video\.pbs\.org/video/(?P<id>\d+)/?' 
_TEST = { u'url': u'http://video.pbs.org/video/2365006249/', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index a18034fe2..e3e9bc07f 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -11,7 +11,7 @@ from ..utils import ( class RutubeIE(InfoExtractor): - _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)' + _VALID_URL = r'https?://rutube\.ru/video/(?P<long_id>\w+)' _TEST = { u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index f5003c7f9..d68646d24 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class SlashdotIE(InfoExtractor): - _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)' + _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)' _TEST = { u'add_ie': ['Ooyala'], diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 3a19ab172..cb6dedab7 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -25,7 +25,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''^(?:https?://)? 
(?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) - |(?P<widget>w.soundcloud.com/player/?.*?url=.*) + |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) ) ''' IE_NAME = u'soundcloud' @@ -217,7 +217,7 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' + _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$' IE_NAME = u'soundcloud:user' # it's in tests/test_playlists.py diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 0d32a0688..11455e0fa 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -6,7 +6,7 @@ from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' + _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { u'add_ie': ['Brightcove'], u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index b27838bf9..d54e01a12 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -18,7 +18,7 @@ from ..utils import ( class StanfordOpenClassroomIE(InfoExtractor): IE_NAME = u'stanfordoc' IE_DESC = u'Stanford Open ClassRoom' - _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' _TEST = { u'url': 
u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', u'file': u'PracticalUnix_intro-environment.mp4', diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 772134a12..2c5c88be8 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' + _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html' _TEST = { u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', u'file': u'10635995.mp4', diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 516e18914..474610eec 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv.unistra.fr/(?:index|video).php\?id_video\=(\d+)' + _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)' _TEST = { u'url': u'http://utv.unistra.fr/video.php?id_video=154', diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py index 3a99a29c6..3cf8c853d 100644 --- a/youtube_dl/extractor/veehd.py +++ b/youtube_dl/extractor/veehd.py @@ -9,7 +9,7 @@ from ..utils import ( ) class VeeHDIE(InfoExtractor): - _VALID_URL = r'https?://veehd.com/video/(?P<id>\d+)' + _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)' _TEST = { u'url': u'http://veehd.com/video/4686958', diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4378b1780..d8bfcd155 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,7 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = 
r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'((http://www\.vevo\.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 6b93afa50..87812d6af 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -6,7 +6,7 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'http://www.vice.com/.*?/(?P<name>.+)' + _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)' _TEST = { u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 826804af3..36d1bde08 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -8,7 +8,7 @@ from ..utils import ( class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' + _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { u"url": u"http://www.viddler.com/v/43903784", u'file': u'43903784.mp4', diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 912802d9a..f75169041 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -7,7 +7,7 @@ from ..utils import ( ) class VideofyMeIE(InfoExtractor): - _VALID_URL = r'https?://(www.videofy.me/.+?|p.videofy.me/v)/(?P<id>\d+)(&|#|$)' + _VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)' IE_NAME = u'videofy.me' _TEST = { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 29c25f0e3..4fab6c6e8 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -11,7 +11,7 @@ from ..utils import ( class WatIE(InfoExtractor): - 
_VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html' + _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html' IE_NAME = 'wat.tv' _TEST = { u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 1fcc518ac..e971b5b4b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -7,7 +7,7 @@ from ..utils import ( class YouJizzIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$' _TEST = { u'url': u'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', u'file': u'2189178.flv', From 6a656a843a629ceef6979976a353d177c97b9527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 20:35:00 +0100 Subject: [PATCH 219/425] Update description value for the write_info_json test (required after 27dcce19045670fc348ff1119c0d2283aaed3ae2) --- test/test_write_info_json.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index d7177611b..90426a559 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -33,6 +33,7 @@ TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.info.json' DESCRIPTION_FILE = TEST_ID + '.mp4.description' EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐 +test URL: https://github.com/rg3/youtube-dl/issues/1892 This is a test video for youtube-dl. 
From bfb9f7bc4c5c6fd9b2d3d46be133988f70534d26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 20:36:26 +0100 Subject: [PATCH 220/425] [hotnewhiphop] Update test's title --- youtube_dl/extractor/hotnewhiphop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 3798118a7..0ee74fb38 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -11,7 +11,7 @@ class HotNewHipHopIE(InfoExtractor): u'file': u'1435540.mp3', u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', u'info_dict': { - u"title": u"Freddie Gibbs - Lay It Down" + u"title": u'Freddie Gibbs "Lay It Down"' } } From e9bf7479d209c2623753628201ca0daffa19f3cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 23:28:40 +0100 Subject: [PATCH 221/425] Add an extractor for theplatform.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/theplatform.py | 69 +++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/theplatform.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..900a6f02f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -144,6 +144,7 @@ from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE from .tf1 import TF1IE +from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .toutv import TouTvIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py new file mode 100644 index 000000000..d1d6a4c2c --- /dev/null +++ b/youtube_dl/extractor/theplatform.py @@ -0,0 +1,69 @@ +import re +import json + +from .common import 
InfoExtractor +from ..utils import ( + xpath_with_ns, + find_xpath_attr, +) + +_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) + + +class ThePlatformIE(InfoExtractor): + _VALID_URL = r'https?://link\.theplatform\.com/s/[^/]+/(?P<id>[^/\?]+)' + + _TEST = { + # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ + u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true', + u'info_dict': { + u'id': u'e9I_cZgTgIPd', + u'ext': u'flv', + u'title': u'Blackberry\'s big, bold Z30', + u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', + u'duration': 247, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _get_info(self, video_id): + smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' + 'format=smil&mbr=true'.format(video_id)) + meta = self._download_xml(smil_url, video_id) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json) + + head = meta.find(_x('smil:head')) + body = meta.find(_x('smil:body')) + base_url = head.find(_x('smil:meta')).attrib['base'] + switch = body.find(_x('smil:switch')) + formats = [] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + formats.append({ + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': int(attr['width']), + 'height': int(attr['height']), + 'vbr': int(attr['system-bitrate']), + }) + formats.sort(key=lambda f: (f['height'], f['width'], f['vbr'])) + + return { + 'id': video_id, + 'title': info['title'], + 'formats': formats, + 'description': info['description'], + 'thumbnail': info['defaultThumbnailUrl'], + 'duration': info['duration']//1000, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + return self._get_info(video_id) 
From b9a2c53833a3cebc32df908aad74f7c5a3537aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Dec 2013 23:43:50 +0100 Subject: [PATCH 222/425] [metacafe] Add support for cbs videos (fixes #1838) They use theplatform.com --- youtube_dl/extractor/metacafe.py | 29 +++++++++++++++++++++++++---- youtube_dl/extractor/theplatform.py | 2 +- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 91480ba87..e59bdd604 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -69,6 +69,21 @@ class MetacafeIE(InfoExtractor): u'age_limit': 18, }, }, + # cbs video + { + u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/', + u'info_dict': { + u'id': u'0rOxMBabDXN6', + u'ext': u'flv', + u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet', + u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d', + u'duration': 129, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, ] @@ -106,10 +121,16 @@ class MetacafeIE(InfoExtractor): video_id = mobj.group(1) - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] + # the video may come from an external site + m_external = re.match('^(\w{2})-(.*)$', video_id) + if m_external is not None: + prefix, ext_id = m_external.groups() + # Check if video comes from YouTube + if prefix == 'yt': + return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') + # CBS videos use theplatform.com + if prefix == 'cb': + return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') # Retrieve video webpage to extract further information req = 
compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index d1d6a4c2c..920689511 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -11,7 +11,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): - _VALID_URL = r'https?://link\.theplatform\.com/s/[^/]+/(?P<id>[^/\?]+)' + _VALID_URL = r'(?:https?://link\.theplatform\.com/s/[^/]+/|theplatform:)(?P<id>[^/\?]+)' _TEST = { # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ From 673d1273ff6f6d3267728fbe6f79c9c801598fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 12:41:58 +0100 Subject: [PATCH 223/425] [vevo] Support '/watch/{id}' urls --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index d8bfcd155..3eedcf7dd 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,7 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www\.vevo\.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', From 7fc3fa0545f8a07414e8c97be9862a3c2f79bb98 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 14:29:08 +0100 Subject: [PATCH 224/425] [9gag] Add extractor --- youtube_dl/YoutubeDL.py | 34 +++++++++++++++++++++++--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ninegag.py | 41 ++++++++++++++++++++++++++++++++ 3 
files changed, 73 insertions(+), 3 deletions(-) create mode 100644 youtube_dl/extractor/ninegag.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b68b110a4..8ad7bd1da 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -405,7 +405,8 @@ class YoutubeDL(object): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}): + def extract_info(self, url, download=True, ie_key=None, extra_info={}, + process=True): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. @@ -441,7 +442,10 @@ class YoutubeDL(object): 'webpage_url': url, 'extractor_key': ie.ie_key(), }) - return self.process_ie_result(ie_result, download, extra_info) + if process: + return self.process_ie_result(ie_result, download, extra_info) + else: + return ie_result except ExtractorError as de: # An error we somewhat expected self.report_error(compat_str(de), de.format_traceback()) break @@ -474,8 +478,32 @@ class YoutubeDL(object): download, ie_key=ie_result.get('ie_key'), extra_info=extra_info) - elif result_type == 'playlist': + elif result_type == 'url_transparent': + # Use the information from the embedding page + info = self.extract_info( + ie_result['url'], ie_key=ie_result.get('ie_key'), + extra_info=extra_info, download=False, process=False) + def make_result(embedded_info): + new_result = ie_result.copy() + for f in ('_type', 'url', 'ext', 'player_url', 'formats', + 'entries', 'urlhandle', 'ie_key', 'duration', + 'subtitles', 'annotations', 'format'): + if f in new_result: + del new_result[f] + if f in embedded_info: + new_result[f] = embedded_info[f] + return new_result + new_result = make_result(info) + + assert new_result.get('_type') != 'url_transparent' + if new_result.get('_type') == 'compat_list': + new_result['entries'] = [ + make_result(e) for e in new_result['entries']] + + return self.process_ie_result( + 
new_result, download=download, extra_info=extra_info) + elif result_type == 'playlist': # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..a77e98d49 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -102,6 +102,7 @@ from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE +from .ninegag import NineGagIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py new file mode 100644 index 000000000..cc00ffbcc --- /dev/null +++ b/youtube_dl/extractor/ninegag.py @@ -0,0 +1,41 @@ +import json +import re + +from .common import InfoExtractor + + +class NineGagIE(InfoExtractor): + IE_NAME = '9gag' + _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)' + + _TEST = { + u"url": u"http://9gag.tv/v/1912", + u"file": u"1912.mp4", + u"info_dict": { + u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. 
(Thanks, Dino!)", + u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome" + }, + u'add_ie': [u'Youtube'] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + data_json = self._html_search_regex(r'''(?x) + <div\s*id="tv-video"\s*data-video-source="youtube"\s* + data-video-meta="([^"]+)"''', webpage, u'video metadata') + + data = json.loads(data_json) + + return { + '_type': 'url_transparent', + 'url': data['youtubeVideoId'], + 'ie_key': 'Youtube', + 'id': video_id, + 'title': data['title'], + 'description': data['description'], + 'view_count': int(data['view_count']), + 'thumbnail': data['thumbnail_url'], + } From a1ef7e85d6834d5e8a9a2171b220a9e3b93dd2cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 14:31:54 +0100 Subject: [PATCH 225/425] Remove unused imports --- youtube_dl/extractor/smotri.py | 1 - youtube_dl/extractor/theplatform.py | 1 - youtube_dl/extractor/viddler.py | 3 --- youtube_dl/extractor/yahoo.py | 2 +- youtube_dl/utils.py | 1 - 5 files changed, 1 insertion(+), 7 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f035a3214..5a28bc820 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -6,7 +6,6 @@ import hashlib from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError ) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 920689511..61452e47d 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -4,7 +4,6 @@ import json from .common import InfoExtractor from ..utils import ( xpath_with_ns, - find_xpath_attr, ) _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 138a35b2a..9328ef4a2 100644 --- 
a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -2,9 +2,6 @@ import json import re from .common import InfoExtractor -from ..utils import ( - determine_ext, -) class ViddlerIE(InfoExtractor): diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e457c4707..5c9c361b9 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -47,7 +47,7 @@ class YahooIE(InfoExtractor): # The 'meta' field is not always in the video webpage, we request it # from another page long_id = info['id'] - return self._get_info(info['id'], video_id) + return self._get_info(long_id, video_id) def _get_info(self, long_id, video_id): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c486ef8ec..77609f7ca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -17,7 +17,6 @@ import ssl import socket import sys import traceback -import xml.etree.ElementTree import zlib try: From 19e3dfc9f8444a1341a6e71752a3235a0447a565 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 5 Dec 2013 18:29:07 +0100 Subject: [PATCH 226/425] [9gag] Like/dislike count (#1895) --- youtube_dl/extractor/common.py | 3 +++ youtube_dl/extractor/ninegag.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1b049082d..92a0c5050 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -55,6 +55,9 @@ class InfoExtractor(object): subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. view_count: How many users have watched the video on the platform. 
+ like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + comment_count: Number of comments on the video urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen age_limit: Age restriction for the video, as an integer (years) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index cc00ffbcc..ea986c00e 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -37,5 +37,7 @@ class NineGagIE(InfoExtractor): 'title': data['title'], 'description': data['description'], 'view_count': int(data['view_count']), + 'like_count': int(data['statistic']['like']), + 'dislike_count': int(data['statistic']['dislike']), 'thumbnail': data['thumbnail_url'], } From 9e6060208430cef6af5e1f6ae24feb65c35fc03c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 20:45:53 +0100 Subject: [PATCH 227/425] =?UTF-8?q?[francetv]=20Add=20support=20for=20more?= =?UTF-8?q?=20channels:=203,=204,=205=20and=20=C3=94=20(#1898)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the France2IE extractor to FranceTVIE --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/francetv.py | 103 +++++++++++++++++++++++++------ 2 files changed, 85 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2e1a8be14..a78dcad7f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -56,7 +56,7 @@ from .flickr import FlickrIE from .francetv import ( PluzzIE, FranceTvInfoIE, - France2IE, + FranceTVIE, GenerationQuoiIE ) from .freesound import FreesoundIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 66aa3aa0d..290e650e0 100644 --- a/youtube_dl/extractor/francetv.py +++ 
b/youtube_dl/extractor/francetv.py @@ -21,7 +21,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): thumbnail_path = info.find('image').text return {'id': video_id, - 'ext': 'mp4', + 'ext': 'flv' if video_url.startswith('rtmp') else 'mp4', 'url': video_url, 'title': info.find('titre').text, 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), @@ -66,35 +66,100 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id) -class France2IE(FranceTVBaseInfoExtractor): - IE_NAME = u'france2.fr' - _VALID_URL = r'''(?x)https?://www\.france2\.fr/ +class FranceTVIE(FranceTVBaseInfoExtractor): + IE_NAME = u'francetv' + IE_DESC = u'France 2, 3, 4, 5 and Ô' + _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: - emissions/.*?/videos/(?P<id>\d+) - | emission/(?P<key>[^/?]+) + emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) + | (emission|jt)/(?P<key>[^/?]+) )''' - _TEST = { - u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - u'file': u'75540104.mp4', - u'info_dict': { - u'title': u'13h15, le samedi...', - u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + _TESTS = [ + # france2 + { + u'url': u'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', + u'file': u'75540104.mp4', + u'info_dict': { + u'title': u'13h15, le samedi...', + u'description': u'md5:2e5b58ba7a2d3692b35c792be081a03d', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, }, - u'params': { - u'skip_download': True, + # france3 + { + u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/videos/rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'info_dict': { + u'id': u'rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'ext': u'flv', + u'title': u'Pièces à conviction du 04/12/2013', + u'description': u'md5:1cf14ea302ba5f10d992c9eb2bff30dd', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, }, - } + # france4 + { + u'url': 
u'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'info_dict': { + u'id': u'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + u'ext': u'flv', + u'title': u'Hero Corp Making of - Extrait 1', + u'description': u'md5:c87d54871b1790679aec1197e73d650a', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }, + # france5 + { + u'url': u'http://www.france5.fr/emissions/c-a-dire/videos/92837968', + u'info_dict': { + u'id': u'92837968', + u'ext': u'mp4', + u'title': u'C à dire ?!', + u'description': u'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + }, + # franceo + { + u'url': u'http://www.franceo.fr/jt/info-afrique/04-12-2013', + u'info_dict': { + u'id': u'92327925', + u'ext': u'mp4', + u'title': u'Infô-Afrique', + u'description': u'md5:ebf346da789428841bee0fd2a935ea55', + }, + u'params': { + # m3u8 download + u'skip_download': True, + }, + u'skip': u'The id changes frequently', + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj.group('key'): webpage = self._download_webpage(url, mobj.group('key')) - video_id = self._html_search_regex( - r'''(?x)<div\s+class="video-player">\s* + id_res = [ + (r'''(?x)<div\s+class="video-player">\s* <a\s+href="http://videos.francetv.fr/video/([0-9]+)"\s+ - class="francetv-video-player">''', - webpage, u'video ID') + class="francetv-video-player">'''), + (r'<a id="player_direct" href="http://info\.francetelevisions' + '\.fr/\?id-video=([^"/&]+)'), + ] + video_id = self._html_search_regex(id_res, webpage, u'video ID') else: video_id = mobj.group('id') return self._extract_video(video_id) From 3514813d5b021c5595b212ba3b1801175840c5c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 5 Dec 2013 21:26:35 +0100 Subject: [PATCH 228/425] [francetv] Add support for urls in the format 
http://www.france3.fr/emissions/{program}/diffusions/{date} (fixes #1898) --- youtube_dl/extractor/francetv.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 290e650e0..ad85bc16d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -72,7 +72,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ (?: emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) - | (emission|jt)/(?P<key>[^/?]+) + | (emissions?|jt)/(?P<key>[^/?]+) )''' _TESTS = [ @@ -91,12 +91,12 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, # france3 { - u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/videos/rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'url': u'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', u'info_dict': { - u'id': u'rhozet_pac_ba_20131204_1933_03122013164521_F3', + u'id': u'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', u'ext': u'flv', - u'title': u'Pièces à conviction du 04/12/2013', - u'description': u'md5:1cf14ea302ba5f10d992c9eb2bff30dd', + u'title': u'Le scandale du prix des médicaments', + u'description': u'md5:1384089fbee2f04fc6c9de025ee2e9ce', }, u'params': { # rtmp download @@ -158,6 +158,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): class="francetv-video-player">'''), (r'<a id="player_direct" href="http://info\.francetelevisions' '\.fr/\?id-video=([^"/&]+)'), + (r'<a class="video" id="ftv_player_(.+?)"'), ] video_id = self._html_search_regex(id_res, webpage, u'video ID') else: From ef4fd848573b601502ba9142d5ce521294024356 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 6 Dec 2013 09:15:04 +0100 Subject: [PATCH 229/425] [wistia] Add extractor --- youtube_dl/YoutubeDL.py | 3 +- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/generic.py | 31 ++++++++++++------ 
youtube_dl/extractor/wistia.py | 55 ++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 youtube_dl/extractor/wistia.py diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8ad7bd1da..07b36a98e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -488,7 +488,8 @@ class YoutubeDL(object): new_result = ie_result.copy() for f in ('_type', 'url', 'ext', 'player_url', 'formats', 'entries', 'urlhandle', 'ie_key', 'duration', - 'subtitles', 'annotations', 'format'): + 'subtitles', 'annotations', 'format', + 'thumbnail', 'thumbnails'): if f in new_result: del new_result[f] if f in embedded_info: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a78dcad7f..a7d37d48b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -178,6 +178,7 @@ from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE +from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE from .xnxx import XNXXIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 10ae06263..216e03218 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -169,8 +169,13 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex(r'<title>(.*)', - webpage, u'video title', default=u'video', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)(.*?)', webpage, u'video title', + default=u'video') + + # video uploader is domain name + video_uploader = self._search_regex( + r'^(?:https?://)?([^/]*)/.*', url, u'video uploader') # Look for BrightCove: bc_url = BrightcoveIE._extract_brightcove_url(webpage) @@ -188,7 +193,7 @@ class GenericIE(InfoExtractor): # Look for embedded YouTube 
player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] @@ -197,13 +202,26 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') for tuppl in matches] return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) + # Look for embedded Wistia player + match = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': unescapeHTML(match.group('url')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': video_id, + } + # Look for Bandcamp pages with custom domain mobj = re.search(r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -247,14 +265,9 @@ class GenericIE(InfoExtractor): # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] - # video uploader is domain name - video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', - url, u'video uploader') - return { 'id': video_id, 'url': video_url, 'uploader': video_uploader, - 'upload_date': None, 'title': video_title, } diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py new file mode 100644 index 000000000..e1748c261 --- /dev/null +++ b/youtube_dl/extractor/wistia.py @@ -0,0 +1,55 @@ +import json +import re + +from .common import InfoExtractor + + +class WistiaIE(InfoExtractor): + _VALID_URL = r'^https?://(?:fast\.)?wistia\.net/embed/iframe/(?P[a-z0-9]+)' + + 
_TEST = { + u"url": u"http://fast.wistia.net/embed/iframe/sh7fpupwlt", + u"file": u"sh7fpupwlt.mov", + u"md5": u"cafeb56ec0c53c18c97405eecb3133df", + u"info_dict": { + u"title": u"cfh_resourceful_zdkh_final_1" + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + data_json = self._html_search_regex( + r'Wistia.iframeInit\((.*?), {}\);', webpage, u'video data') + + data = json.loads(data_json) + + formats = [] + thumbnails = [] + for atype, a in data['assets'].items(): + if atype == 'still': + thumbnails.append({ + 'url': a['url'], + 'resolution': '%dx%d' % (a['width'], a['height']), + }) + continue + if atype == 'preview': + continue + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'width': a['width'], + 'height': a['height'], + 'filesize': a['size'], + 'ext': a['ext'], + }) + formats.sort(key=lambda a: a['filesize']) + + return { + 'id': video_id, + 'title': data['name'], + 'formats': formats, + 'thumbnails': thumbnails, + } From 4e761794760ff5b281205838bf8a02ea496b89b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:03:08 +0100 Subject: [PATCH 230/425] [vimeo] Extract views count, likes count and comments count (#1895) --- youtube_dl/extractor/vimeo.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f27763ae2..ac956e673 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -196,6 +196,16 @@ class VimeoIE(InfoExtractor): if mobj is not None: video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) + try: + view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count')) + like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count')) + comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count')) + 
except RegexNotFoundError: + # This info is only available in vimeo.com/{id} urls + view_count = None + like_count = None + comment_count = None + # Vimeo specific: extract request signature and timestamp sig = config['request']['signature'] timestamp = config['request']['timestamp'] @@ -242,6 +252,9 @@ class VimeoIE(InfoExtractor): 'description': video_description, 'formats': formats, 'webpage_url': url, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, } From 336c3a69bd198130e2f65f14dfc83383fec7c5e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:22:04 +0100 Subject: [PATCH 231/425] [youtube] Extract like and dislike count (#1895) --- youtube_dl/extractor/youtube.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fff761bd..52c8e7d04 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -388,10 +388,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): super(YoutubeIE, self).__init__(*args, **kwargs) self._player_cache = {} - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) @@ -1258,15 +1254,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = self._extract_id(url) # Get video webpage - self.report_video_webpage_download(video_id) url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, 
compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') + video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1383,6 +1372,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_description = u'' + def _extract_count(klass): + count = self._search_regex(r'class="%s">([\d,]+)' % re.escape(klass), video_webpage, klass, fatal=False) + if count is not None: + return int(count.replace(',', '')) + return None + like_count = _extract_count(u'likes-count') + dislike_count = _extract_count(u'dislikes-count') + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -1515,6 +1512,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'annotations': video_annotations, 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, }) return results From f53c966a73df42a9a949912ef8ab99a64fb99466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:36:36 +0100 Subject: [PATCH 232/425] [dailymotion] Extract view count (#1895) --- test/test_utils.py | 5 +++++ youtube_dl/extractor/dailymotion.py | 5 +++++ youtube_dl/utils.py | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index e9e590e74..0fa66beec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -26,6 +26,7 @@ from youtube_dl.utils import ( unsmuggle_url, shell_quote, encodeFilename, + str_to_int, ) if sys.version_info < (3, 0): @@ -176,6 +177,10 @@ class TestUtil(unittest.TestCase): args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')] self.assertEqual(shell_quote(args), u"""ffmpeg -i 
'ñ€ß'"'"'.mp4'""") + def test_str_to_int(self): + self.assertEqual(str_to_int('123,456'), 123456) + self.assertEqual(str_to_int('123.456'), 123456) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 71f5e03ee..3756cf765 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_attribute, get_element_by_id, orderedSet, + str_to_int, ExtractorError, ) @@ -146,6 +147,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return + view_count = str_to_int(self._search_regex( + r'video_views_value[^>]+>([\d\.]+)<', webpage, u'view count')) + return { 'id': video_id, 'formats': formats, @@ -155,6 +159,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, + 'view_count': view_count, } def _get_available_subtitles(self, video_id, webpage): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 77609f7ca..7b5878830 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1020,3 +1020,7 @@ def format_bytes(bytes): suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] converted = float(bytes) / float(1024 ** exponent) return u'%.2f%s' % (converted, suffix) + +def str_to_int(int_str): + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) From 563e405411131628a6ea160c3fe2b2b4a883ac85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 13:41:07 +0100 Subject: [PATCH 233/425] [dailymotion] Fix view count regex In some languages they can be in the format '123,456' instead of '123.456' --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3756cf765..3bd0b862c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -148,7 +148,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.]+)<', webpage, u'view count')) + r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) return { 'id': video_id, From 7d4afc557f88a05f4f45618c07443aee5aa2099e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 19:48:54 +0100 Subject: [PATCH 234/425] [youtube:playlist] Support mix ids longer than 13 (#1295) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 52c8e7d04..91f8028ff 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1553,7 +1553,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _extract_mix(self, playlist_id): # The mixes are generated from a a single video # the id of the playlist is just 'RD' + video_id - url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) + url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') title_span = (get_element_by_attribute('class', 'title long-title', webpage) or get_element_by_attribute('class', 'title ', webpage)) @@ -1581,7 +1581,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - if len(playlist_id) == 13: # 'RD' + 11 characters for the video id + if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) 
From 715c8e7bdb219f30f83c7d76cbbbc77195366cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 19:52:41 +0100 Subject: [PATCH 235/425] [youtube:playlist] Recognize mix ids for direct use (fixes #1295) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 91f8028ff..01715024c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1528,10 +1528,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): \? (?:.*?&)*? (?:p|a|list)= | p/ ) - ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}) .* | - ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' From 0b6a9f639f6447c7e09c38b88b42964e8fa05349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 20:14:29 +0100 Subject: [PATCH 236/425] [vevo] Update test video's duration --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3eedcf7dd..4823992ef 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -24,7 +24,7 @@ class VevoIE(InfoExtractor): u"upload_date": u"20130624", u"uploader": u"Hurts", u"title": u"Somebody to Die For", - u"duration": 230, + u"duration": 230.12, u"width": 1920, u"height": 1080, } From d349cd22401648e88d57b6dcdd0c8bbb12aaa0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 20:26:55 +0100 Subject: [PATCH 237/425] [imdb] Fix extraction The paths to each format's page may have leading whitespace. The height and the duration can't be extracted. 
--- youtube_dl/extractor/imdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index d8e9712a7..6fb373db2 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -21,7 +21,6 @@ class ImdbIE(InfoExtractor): u'ext': u'mp4', u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb', u'description': u'md5:9061c2219254e5d14e03c25c98e96a81', - u'duration': 151, } } @@ -35,6 +34,7 @@ class ImdbIE(InfoExtractor): flags=re.MULTILINE) formats = [] for f_id, f_path in available_formats: + f_path = f_path.strip() format_page = self._download_webpage( compat_urlparse.urljoin(url, f_path), u'Downloading info for %s format' % f_id) @@ -46,7 +46,6 @@ class ImdbIE(InfoExtractor): formats.append({ 'format_id': f_id, 'url': format_info['url'], - 'height': int(info['titleObject']['encoding']['selected'][:-1]), }) return { @@ -55,5 +54,4 @@ class ImdbIE(InfoExtractor): 'formats': formats, 'description': descr, 'thumbnail': format_info['slate'], - 'duration': int(info['titleObject']['title']['duration_seconds']), } From 5cc14c2fd74a721be0effd5bc06a76164a9c97a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Dec 2013 21:47:32 +0100 Subject: [PATCH 238/425] [vimeo] Add an extractor for albums (closes #1911) --- test/test_playlists.py | 9 +++++++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vimeo.py | 23 ++++++++++++++++++++--- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 00c950109..6a5e0b780 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -16,6 +16,7 @@ from youtube_dl.extractor import ( DailymotionUserIE, VimeoChannelIE, VimeoUserIE, + VimeoAlbumIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, @@ -65,6 +66,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Nki') 
self.assertTrue(len(result['entries']) > 65) + def test_vimeo_album(self): + dl = FakeYDL() + ie = VimeoAlbumIE(dl) + result = ie.extract('http://vimeo.com/album/2632481') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Staff Favorites: November 2013') + self.assertTrue(len(result['entries']) > 12) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a7d37d48b..ac0a11dfe 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -170,6 +170,7 @@ from .vimeo import ( VimeoIE, VimeoChannelIE, VimeoUserIE, + VimeoAlbumIE, ) from .vine import VineIE from .viki import VikiIE diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ac956e673..293dad3c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,7 +20,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|(?Pplayer))\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|(?Pplayer))\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ @@ -264,11 +264,14 @@ class VimeoChannelIE(InfoExtractor): _MORE_PAGES_INDICATOR = r']+?title="(.*?)"' + def _page_url(self, base_url, pagenum): + return '%s/videos/page:%d/' % (base_url, pagenum) + def _extract_videos(self, list_id, base_url): video_ids = [] for pagenum in itertools.count(1): webpage = self._download_webpage( - '%s/videos/page:%d/' % (base_url, pagenum),list_id, + self._page_url(base_url, pagenum) ,list_id, u'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) 
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: @@ -297,7 +300,7 @@ class VimeoUserIE(VimeoChannelIE): @classmethod def suitable(cls, url): - if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url): + if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url): return False return super(VimeoUserIE, cls).suitable(url) @@ -305,3 +308,17 @@ class VimeoUserIE(VimeoChannelIE): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') return self._extract_videos(name, 'http://vimeo.com/%s' % name) + + +class VimeoAlbumIE(VimeoChannelIE): + IE_NAME = u'vimeo:album' + _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P\d+)' + _TITLE_RE = r'