From c8434e83163fc90007eb5b501ea0e827f8b5e127 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 4 Nov 2013 03:08:17 +0100 Subject: [PATCH 001/150] Add support for crunchyroll.com --- youtube_dl/aes.py | 144 ++++++++++++++++++++--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/crunchyroll.py | 171 ++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+), 18 deletions(-) create mode 100644 youtube_dl/extractor/crunchyroll.py diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 9a0c93fa6..e9c5e2152 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -1,4 +1,4 @@ -__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text'] +__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] import base64 from math import ceil @@ -32,6 +32,31 @@ def aes_ctr_decrypt(data, key, counter): return decrypted_data +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data=[] + previous_cipher_block = iv + for i in range(block_count): + block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES] + block += [0]*(BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + def key_expansion(data): """ Generate key schedule @@ -75,7 +100,7 @@ def aes_encrypt(data, expanded_key): @returns {int[]} 16-Byte cipher """ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) for i in range(1, rounds+1): data = sub_bytes(data) @@ -83,6 +108,26 @@ def aes_encrypt(data, expanded_key): if i != 
rounds: data = mix_columns(data) data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) + + return data + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + @param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]) + if i != rounds: + data = mix_columns_inv(data) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) return data @@ -139,14 +184,69 @@ SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) -MIX_COLUMN_MATRIX = ((2,3,1,1), - (1,2,3,1), - (1,1,2,3), - (3,1,1,2)) +SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 
0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) +MIX_COLUMN_MATRIX = ((0x2,0x3,0x1,0x1), + (0x1,0x2,0x3,0x1), + (0x1,0x1,0x2,0x3), + (0x3,0x1,0x1,0x2)) +MIX_COLUMN_MATRIX_INV = ((0xE,0xB,0xD,0x9), + (0x9,0xE,0xB,0xD), + (0xD,0x9,0xE,0xB), + (0xB,0xD,0x9,0xE)) +RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 
0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) +RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 
0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) def sub_bytes(data): return [SBOX[x] for x in data] +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + def rotate(data): return data[1:] + [data[0]] @@ -160,30 +260,31 @@ def key_schedule_core(data, rcon_iteration): def xor(data1, data2): return [x^y for x, y in zip(data1, data2)] -def mix_column(data): +def rijndael_mul(a, b): + if(a==0 or b==0): + return 0 + return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] + +def mix_column(data, matrix): data_mixed = [] for row in range(4): mixed = 0 for column in range(4): - addend = data[column] - if MIX_COLUMN_MATRIX[row][column] in (2,3): - addend <<= 1 - if addend > 0xff: - addend &= 0xff - addend ^= 0x1b - if MIX_COLUMN_MATRIX[row][column] == 3: - addend ^= data[column] - mixed ^= addend & 0xff + # xor is (+) and (-) + mixed ^= rijndael_mul(data[column], matrix[row][column]) data_mixed.append(mixed) return data_mixed -def mix_columns(data): +def mix_columns(data, matrix=MIX_COLUMN_MATRIX): data_mixed = [] for i in range(4): column = data[i*4 : (i+1)*4] - data_mixed += mix_column(column) + data_mixed += mix_column(column, matrix) return data_mixed +def mix_columns_inv(data): + return mix_columns(data, MIX_COLUMN_MATRIX_INV) + def shift_rows(data): data_shifted = [] for column in range(4): @@ -191,6 +292,13 @@ def shift_rows(data): data_shifted.append( data[((column + row) & 0b11) * 4 + row] ) return data_shifted +def shift_rows_inv(data): + data_shifted = [] + for column in range(4): + for row in range(4): + data_shifted.append( data[((column - row) & 0b11) * 4 + row] ) + return data_shifted + def inc(data): data = data[:] # copy for i in range(len(data)-1,-1,-1): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bcf1cce7f..a61e17ea1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,6 
+23,7 @@ from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE +from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE from .dailymotion import ( DailymotionIE, diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py new file mode 100644 index 000000000..4bd366079 --- /dev/null +++ b/youtube_dl/extractor/crunchyroll.py @@ -0,0 +1,171 @@ +# encoding: utf-8 +import re, base64, zlib +from hashlib import sha1 +from math import pow, sqrt, floor +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + bytes_to_intlist, + intlist_to_bytes, + unified_strdate, + clean_html, +) +from ..aes import ( + aes_cbc_decrypt, + inc, +) + +class CrunchyrollIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?(?Pcrunchyroll\.com/[^/]*/[^/?&]*?(?P[0-9]+))(?:[/?&]|$)' + _TESTS = [{ + u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', + u'file': u'645513.flv', + #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', + u'info_dict': { + u'title': u'Wanna be the Strongest in the World – Episode 1 – An Idol-Wrestler is Born!', + u'description': u'md5:2d17137920c64f2f49981a7797d275ef', + u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + u'uploader': u'Yomiuri Telecasting Corporation (YTV)', + u'upload_date': u'20131013', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + }] + + _FORMAT_IDS = { + u'360': (u'60', u'106'), + u'480': (u'61', u'106'), + u'720': (u'62', u'106'), + u'1080': (u'80', u'108'), + } + + def _decrypt_subtitles(self, data, iv, id): + data = bytes_to_intlist(data) + iv = bytes_to_intlist(iv) + id = int(id) + + def obfuscate_key_aux(count, modulo, start): + output = list(start) + for _ in range(count): + output.append(output[-1] 
+ output[-2]) + # cut off start values + output = output[2:] + output = list(map(lambda x: x % modulo + 33, output)) + return output + + def obfuscate_key(key): + num1 = int(floor(pow(2, 25) * sqrt(6.9))) + num2 = (num1 ^ key) << 5 + num3 = key ^ num1 + num4 = num3 ^ (num3 >> 3) ^ num2 + prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) + shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest()) + # Extend 160 Bit hash to 256 Bit + return shaHash + [0] * 12 + + key = obfuscate_key(id) + class Counter: + __value = iv + def next_value(self): + temp = self.__value + self.__value = inc(self.__value) + return temp + decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) + return zlib.decompress(decrypted_data) + + def _convert_subtitles_to_srt(self, subtitles): + i=1 + output = u'' + for start, end, text in re.findall(r']*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + start = start.replace(u'.', u',') + end = end.replace(u'.', u',') + text = clean_html(text) + text = text.replace(u'\\N', u'\n') + if not text: + continue + output += u'%d\n%s --> %s\n%s\n\n' % (i, start, end, text) + i+=1 + return output + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://www.' + mobj.group('url') + video_id = mobj.group(u'video_id') + webpage = self._download_webpage(webpage_url, video_id) + note_m = self._html_search_regex(r'
(.+?)
', webpage, u'trailer-notice', default=u'') + if note_m: + raise ExtractorError(note_m) + + video_title = self._html_search_regex(r']*>(.+?)', webpage, u'video_title', flags=re.DOTALL) + video_title = re.sub(r' {5} *–? *', u' – ', video_title) + video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') + if not video_description: + video_description = None + video_upload_date = self._html_search_regex(r'
Availability for free users:(.+?)
', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL) + if video_upload_date: + video_upload_date = unified_strdate(video_upload_date) + video_uploader = self._html_search_regex(r'
\s*Publisher:(.+?)
', webpage, u'video_uploader', fatal=False, flags=re.DOTALL) + + playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url')) + playerdata_req = compat_urllib_request.Request(playerdata_url) + playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url}) + playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info') + + stream_id = self._search_regex(r'([^<]+)', playerdata, u'stream_id') + video_thumbnail = self._search_regex(r'([^<]+)', playerdata, u'thumbnail', fatal=False) + + formats = [] + for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt+u'p' + streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/') + # urlencode doesn't work! + streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format + streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded') + streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data))) + streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format) + video_url = self._search_regex(r'([^<]+)', streamdata, u'video_url') + video_play_path = self._search_regex(r'([^<]+)', streamdata, u'video_play_path') + formats.append({ + u'url': video_url, + u'play_path': video_play_path, + u'ext': 'flv', + u'format': video_format, + u'format_id': video_format, + }) + + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage(u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\ + video_id, note=u'Downloading subtitles for '+sub_name) + id = 
self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False) + iv = self._search_regex(r'([^<]+)', sub_page, u'subtitle_iv', fatal=False) + data = self._search_regex(r'([^<]+)', sub_page, u'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode(u'utf-8') + lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False) + if not lang_code: + continue + subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + + return { + u'id': video_id, + u'title': video_title, + u'description': video_description, + u'thumbnail': video_thumbnail, + u'uploader': video_uploader, + u'upload_date': video_upload_date, + u'subtitles': subtitles, + u'formats': formats, + } From 0a688bc0b28c970e9af965b3fa0c7927507eeb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 30 Nov 2013 14:56:51 +0100 Subject: [PATCH 002/150] [youtube] Add support for downloading top lists (fixes #1868) It needs to know the channel and the title of the list, because the ids change every time you browse the channels and are attached to a 'VISITOR_INFO1_LIVE' cookie. 
--- test/test_youtube_lists.py | 8 ++++++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 95f07d129..33db09f43 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( YoutubeIE, YoutubeChannelIE, YoutubeShowIE, + YoutubeTopListIE, ) @@ -116,5 +117,12 @@ class TestYoutubeLists(unittest.TestCase): original_video = entries[0] self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + def test_youtube_toplist(self): + dl = FakeYDL() + ie = YoutubeTopListIE(dl) + result = ie.extract('yttoplist:music:Top Tracks') + entries = result['entries'] + self.assertTrue(len(entries) >= 9) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 664639b53..0abf86e44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -194,6 +194,7 @@ from .youtube import ( YoutubeWatchLaterIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeTopListIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 765b4a9bf..a1a4d896d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1576,6 +1576,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if len(playlist_id) == 13: # 'RD' + 11 characters for the video id # Mixes require a custom extraction process return self._extract_mix(playlist_id) + if playlist_id.startswith('TL'): + raise ExtractorError(u'For downloading YouTube.com top lists, use ' + u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) # Extract the video ids from the playlist pages ids = [] @@ -1598,6 +1601,38 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, 
playlist_id, playlist_title) +class YoutubeTopListIE(YoutubePlaylistIE): + IE_NAME = u'youtube:toplist' + IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"' + u' (Example: "yttoplist:music:Top Tracks")') + _VALID_URL = r'yttoplist:(?P.*?):(?P.*?)$' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel = mobj.group('chann') + title = mobj.group('title') + query = compat_urllib_parse.urlencode({'title': title}) + playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query) + channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) + link = self._html_search_regex(playlist_re, channel_page, u'list') + url = compat_urlparse.urljoin('https://www.youtube.com/', link) + + video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' + ids = [] + # sometimes the webpage doesn't contain the videos + # retry until we get them + for i in itertools.count(0): + msg = u'Downloading Youtube mix' + if i > 0: + msg += ', retry #%d' % i + webpage = self._download_webpage(url, title, msg) + ids = orderedSet(re.findall(video_re, webpage)) + if ids: + break + url_results = self._ids_to_results(ids) + return self.playlist_result(url_results, playlist_title=title) + + class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" From 1dcc4c0cad886457c0fa5f874c38f95f0510ea4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 14:57:53 +0100 Subject: [PATCH 003/150] Add --load-info option (#972) It just calls the 'YoutubeDL.process_ie_result' with the dictionary from the json file --- youtube_dl/YoutubeDL.py | 6 ++++++ youtube_dl/__init__.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b68b110a4..80c056dc8 
100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -812,6 +812,12 @@ class YoutubeDL(object): return self._download_retcode + def download_with_info_file(self, info_filename): + with open(info_filename, 'r') as f: + # TODO: Check for errors + info = json.load(f) + self.process_ie_result(info, download=True) + def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" info = dict(ie_info) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index d2446b670..b0d9a6763 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -360,6 +360,9 @@ def parseOpts(overrideArguments=None): help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) filesystem.add_option('-a', '--batch-file', dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') + filesystem.add_option('--load-info', + dest='load_info_filename', metavar='FILE', + help='json file containing the video information (created with the "--write-json" option') filesystem.add_option('-w', '--no-overwrites', action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) filesystem.add_option('-c', '--continue', @@ -706,14 +709,17 @@ def _real_main(argv=None): update_self(ydl.to_screen, opts.verbose) # Maybe do nothing - if len(all_urls) < 1: + if (len(all_urls) < 1) and (opts.load_info_filename is None): if not opts.update_self: parser.error(u'you must provide at least one URL') else: sys.exit() try: - retcode = ydl.download(all_urls) + if opts.load_info_filename is not None: + retcode = ydl.download_with_info_file(opts.load_info_filename) + else: + retcode = ydl.download(all_urls) except MaxDownloadsReached: ydl.to_screen(u'--max-download limit reached, aborting.') retcode = 101 From d494389821de832874dc78abc2fe16365b5fe815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= 
<jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 3 Dec 2013 20:16:52 +0100 Subject: [PATCH 004/150] Option '--load-info': if the download fails, try extracting the info with the 'webpage_url' field of the info dict The video url may have expired. --- youtube_dl/YoutubeDL.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 80c056dc8..77339dddf 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -816,7 +816,16 @@ class YoutubeDL(object): with open(info_filename, 'r') as f: # TODO: Check for errors info = json.load(f) - self.process_ie_result(info, download=True) + try: + self.process_ie_result(info, download=True) + except DownloadError: + webpage_url = info.get('webpage_url') + if webpage_url is not None: + self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url) + return self.download([webpage_url]) + else: + raise + return self._download_retcode def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" From 55f6597c67dd04729dbc1b83d81bfbd63d7e9c0a Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 08:41:09 +0700 Subject: [PATCH 005/150] [smotri] Add an extractor for live rtmp broadcasts --- youtube_dl/FileDownloader.py | 49 ++++++++++++-- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/smotri.py | 106 ++++++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 8 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 3ff9716b3..de1dc66bb 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -226,6 +226,22 @@ class FileDownloader(object): (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True) self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' % (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) + + def report_progress_live_stream(self, 
downloaded_data_len, speed, elapsed): + if self.params.get('noprogress', False): + return + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + downloaded_str = format_bytes(downloaded_data_len) + speed_str = self.format_speed(speed) + elapsed_str = FileDownloader.format_seconds(elapsed) + if self.params.get('progress_with_newline', False): + self.to_screen(u'[download] %s at %s' % + (downloaded_str, speed_str)) + else: + self.to_screen(u'\r%s[download] %s at %s ET %s' % + (clear_line, downloaded_str, speed_str, elapsed_str), skip_eol=True) + self.to_console_title(u'youtube-dl - %s at %s ET %s' % + (downloaded_str.strip(), speed_str.strip(), elapsed_str.strip())) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -255,7 +271,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn): def run_rtmpdump(args): start = time.time() resume_percent = None @@ -301,11 +317,27 @@ class FileDownloader(object): 'eta': eta, 'speed': speed, }) - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen(u'') - cursor_in_new_line = True - self.to_screen(u'[rtmpdump] '+line) + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1))*1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self.report_progress_live_stream(downloaded_data_len, speed, time_now - start) + cursor_in_new_line = False + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 
'speed': speed, + }) + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen(u'') + cursor_in_new_line = True + self.to_screen(u'[rtmpdump] '+line) proc.wait() if not cursor_in_new_line: self.to_screen(u'') @@ -338,6 +370,8 @@ class FileDownloader(object): basic_args += ['--stop', '1'] if live: basic_args += ['--live'] + if conn: + basic_args += ['--conn', conn] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if sys.platform == 'win32' and sys.version_info < (3, 0): @@ -479,7 +513,8 @@ class FileDownloader(object): info_dict.get('page_url', None), info_dict.get('play_path', None), info_dict.get('tc_url', None), - info_dict.get('rtmp_live', False)) + info_dict.get('rtmp_live', False), + info_dict.get('rtmp_conn', None)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd996483b..60e2d6ebd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -125,6 +125,7 @@ from .smotri import ( SmotriIE, SmotriCommunityIE, SmotriUserIE, + SmotriBroadcastIE, ) from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f035a3214..f86ee8388 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -3,10 +3,12 @@ import re import json import hashlib +import uuid from .common import InfoExtractor from ..utils import ( - determine_ext, + compat_urllib_parse, + compat_urllib_request, ExtractorError ) @@ -250,3 +252,105 @@ class SmotriUserIE(InfoExtractor): u'user nickname') return self.playlist_result(entries, user_id, user_nickname) + + +class SmotriBroadcastIE(InfoExtractor): + IE_DESC = u'Smotri.com broadcasts' + IE_NAME = u'smotri:broadcast' + _VALID_URL = 
r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + broadcast_id = mobj.group('broadcastid') + + broadcast_url = 'http://' + mobj.group('url') + broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page') + + if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: + raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True) + + # Adult content + if re.search(u'EroConfirmText">', broadcast_page) is not None: + + (username, password) = self._get_login_info() + if username is None: + raise ExtractorError(u'Erotic broadcasts allowed only for registered users, ' + u'use --username and --password options to provide account credentials.', expected=True) + + # Log in + login_form_strs = { + u'login-hint53': '1', + u'confirm_erotic': '1', + u'login': username, + u'password': password, + } + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_url = broadcast_url + '/?no_redirect=1' + request = compat_urllib_request.Request(login_url, login_data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + broadcast_page = self._download_webpage( + request, broadcast_id, note=u'Logging in and confirming age') + + if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None: + raise ExtractorError(u'Unable to log in: bad username or password', expected=True) + + adult_content = True + else: + adult_content = False + + ticket = self._html_search_regex( + u'window.broadcast_control.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + broadcast_page, u'broadcast ticket') + + url = 'http://smotri.com/broadcast/view/url/?ticket=%s' 
% ticket + + broadcast_password = self._downloader.params.get('videopassword', None) + if broadcast_password: + url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() + + broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON') + + try: + broadcast_json = json.loads(broadcast_json_page) + + protected_broadcast = broadcast_json['_pass_protected'] == 1 + if protected_broadcast and not broadcast_password: + raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True) + + broadcast_offline = broadcast_json['is_play'] == 0 + if broadcast_offline: + raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True) + + rtmp_url = broadcast_json['_server'] + if not rtmp_url.startswith('rtmp://'): + raise ExtractorError(u'Unexpected broadcast rtmp URL') + + broadcast_playpath = broadcast_json['_streamName'] + broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_title = broadcast_json['title'] + broadcast_description = broadcast_json['description'] + broadcaster_nick = broadcast_json['nick'] + broadcaster_login = broadcast_json['login'] + rtmp_conn = 'S:%s' % uuid.uuid4().hex + except KeyError: + if protected_broadcast: + raise ExtractorError(u'Bad broadcast password', expected=True) + raise ExtractorError(u'Unexpected broadcast JSON') + + return { + 'id': broadcast_id, + 'url': rtmp_url, + 'title': broadcast_title, + 'thumbnail': broadcast_thumbnail, + 'description': broadcast_description, + 'uploader': broadcaster_nick, + 'uploader_id': broadcaster_login, + 'age_limit': 18 if adult_content else 0, + 'ext': 'flv', + 'play_path': broadcast_playpath, + 'rtmp_live': True, + 'rtmp_conn': rtmp_conn + } \ No newline at end of file From 8aff7b9bc47795288c65399d6fcac7a8c48004e9 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Wed, 4 Dec 2013 12:36:12 +0700 Subject: [PATCH 006/150] [smotri] Fix broadcast ticket regex --- 
youtube_dl/extractor/smotri.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index f86ee8388..ff539ea0c 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -302,7 +302,7 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - u'window.broadcast_control.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + u'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', broadcast_page, u'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket From ca9e02dc00fcfbc86fdf5d645417430ecb135537 Mon Sep 17 00:00:00 2001 From: Adam Glenn <gekitsuu@gmail.com> Date: Fri, 6 Dec 2013 21:11:01 -0800 Subject: [PATCH 007/150] Adding pyvideo support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pyvideo.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 youtube_dl/extractor/pyvideo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f6a23f663..d7cb6e463 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -111,6 +111,7 @@ from .photobucket import PhotobucketIE from .podomatic import PodomaticIE from .pornhub import PornHubIE from .pornotube import PornotubeIE +from .pyvideo import PyvideoIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py new file mode 100644 index 000000000..243dff806 --- /dev/null +++ b/youtube_dl/extractor/pyvideo.py @@ -0,0 +1,26 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class PyvideoIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)' + _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(\d+)/(.*)' + _TEST = { + u'url': 
u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', + u'file': u'Become a logging expert in 30 minutes-24_4WWkSmNo.mp4', + u'md5': u'bf08cae24e1601027f98ae1262c299ad', + u'info_dict': { + u"title": u"Become a logging expert in 30 minutes" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(2) + webpage = self._download_webpage(url, video_id) + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) + + if m_youtube is not None: + return self.url_result(m_youtube.group(1), 'Youtube') From f623530d6e1ac63e0e2056274cb9b59b8699ce0c Mon Sep 17 00:00:00 2001 From: Adam Glenn <gekitsuu@gmail.com> Date: Fri, 6 Dec 2013 21:12:10 -0800 Subject: [PATCH 008/150] removing bad VALID_URL --- youtube_dl/extractor/pyvideo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 243dff806..5c47993e7 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -5,7 +5,6 @@ from ..utils import determine_ext class PyvideoIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)' _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(\d+)/(.*)' _TEST = { u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', From 0d9ec5d9630afa298f8b9acdfae0d85fbaa3fe58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 7 Dec 2013 10:59:18 +0100 Subject: [PATCH 009/150] [pyvideo] Cleanup and fix test --- youtube_dl/extractor/pyvideo.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 5c47993e7..e28ca3fb6 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,18 +1,22 @@ import re from .common import InfoExtractor -from ..utils import determine_ext class 
PyvideoIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(\d+)/(.*)' _TEST = { u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - u'file': u'Become a logging expert in 30 minutes-24_4WWkSmNo.mp4', - u'md5': u'bf08cae24e1601027f98ae1262c299ad', + u'file': u'24_4WWkSmNo.mp4', + u'md5': u'de317418c8bc76b1fd8633e4f32acbc6', u'info_dict': { - u"title": u"Become a logging expert in 30 minutes" - } + u"title": u"Become a logging expert in 30 minutes", + u"description": u"md5:9665350d466c67fb5b1598de379021f7", + u"upload_date": u"20130320", + u"uploader": u"NextDayVideo", + u"uploader_id": u"NextDayVideo", + }, + u'add_ie': ['Youtube'], } def _real_extract(self, url): From c4d9e6731a80cc1ce173d444920d49b424c62887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 7 Dec 2013 11:19:59 +0100 Subject: [PATCH 010/150] [pyvideo] add support for videos that don't come from Youtube --- youtube_dl/extractor/pyvideo.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index e28ca3fb6..33054591b 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,11 +1,12 @@ import re +import os from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(\d+)/(.*)' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _TESTS = [{ u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', u'file': u'24_4WWkSmNo.mp4', u'md5': u'de317418c8bc76b1fd8633e4f32acbc6', @@ -17,13 +18,34 @@ class PyvideoIE(InfoExtractor): u"uploader_id": u"NextDayVideo", }, u'add_ie': ['Youtube'], - } + }, + { + u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', + u'md5': 
u'5fe1c7e0a8aa5570330784c847ff6d12', + u'info_dict': { + u'id': u'2542', + u'ext': u'm4v', + u'title': u'Gloriajw-SpotifyWithErikBernhardsson182', + }, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(2) + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) if m_youtube is not None: return self.url_result(m_youtube.group(1), 'Youtube') + + title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>', + webpage, u'title', flags=re.DOTALL) + video_url = self._search_regex([r'<source src="(.*?)"', + r'<dt>Download</dt>.*?<a href="(.+?)"'], + webpage, u'video url', flags=re.DOTALL) + return { + 'id': video_id, + 'title': os.path.splitext(title)[0], + 'url': video_url, + } From 8ca6b8fba1ef0a2e221681a2deff3eea00f0fa35 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sat, 7 Dec 2013 21:39:32 +0100 Subject: [PATCH 011/150] [XHamsterIE] Fix HD video detection --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 279f75e7a..0dd5e805c 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -46,7 +46,7 @@ class XHamsterIE(InfoExtractor): return mobj.group('server')+'/key='+mobj.group('file') def is_hd(webpage): - return webpage.find('<div class=\'icon iconHD\'>') != -1 + return webpage.find('<div class=\'icon iconHD\'') != -1 mobj = re.match(self._VALID_URL, url) From b860967ce46358f6f54d2374af7e880155339392 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sat, 7 Dec 2013 22:17:13 +0100 Subject: [PATCH 012/150] [XHamsterIE] Fix md5 in second test --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py 
b/youtube_dl/extractor/xhamster.py index 0dd5e805c..ef9997ee4 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -26,7 +26,7 @@ class XHamsterIE(InfoExtractor): { u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', u'file': u'2221348.flv', - u'md5': u'970a94178ca4118c5aa3aaea21211b81', + u'md5': u'e767b9475de189320f691f49c679c4c7', u'info_dict': { u"upload_date": u"20130914", u"uploader_id": u"jojo747400", From a213880aaffb0c236cb0b2141f167bca307f2628 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 05:49:35 +0100 Subject: [PATCH 013/150] Simplify status reporting (#1918) --- youtube_dl/FileDownloader.py | 39 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 3ff9716b3..ac6a6d8a0 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -204,11 +204,19 @@ class FileDownloader(object): """Report destination filename.""" self.to_screen(u'[download] Destination: ' + filename) + def _report_progress_status(self, msg, is_last_line=False): + clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + if self.params.get('progress_with_newline', False): + self.to_screen(u'[download] ' + msg) + else: + self.to_screen(u'\r%s[download] %s' % (clear_line, msg), + skip_eol=not is_last_line) + self.to_console_title(u'youtube-dl ' + msg) + def report_progress(self, percent, data_len_str, speed, eta): """Report download progress.""" if self.params.get('noprogress', False): return - clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') if eta is not None: eta_str = self.format_eta(eta) else: @@ -218,14 +226,20 @@ class FileDownloader(object): else: percent_str = 'Unknown %' speed_str = self.format_speed(speed) - if self.params.get('progress_with_newline', False): - self.to_screen(u'[download] %s of 
%s at %s ETA %s' % - (percent_str, data_len_str, speed_str, eta_str)) + + msg = (u'%s of %s at %s ETA %s' % + (percent_str, data_len_str, speed_str, eta_str)) + self._report_progress_status(msg) + + def report_finish(self, data_len_str, tot_time): + """Report download finished.""" + if self.params.get('noprogress', False): + self.to_screen(u'[download] Download completed') else: - self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' % - (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True) - self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' % - (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) + self._report_progress_status( + (u'100%% of %s in %s' % + (data_len_str, self.format_seconds(tot_time))), + is_last_line=True) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -246,15 +260,6 @@ class FileDownloader(object): """Report it was impossible to resume download.""" self.to_screen(u'[download] Unable to resume') - def report_finish(self, data_len_str, tot_time): - """Report download finished.""" - if self.params.get('noprogress', False): - self.to_screen(u'[download] Download completed') - else: - clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') - self.to_screen(u'\r%s[download] 100%% of %s in %s' % - (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): def run_rtmpdump(args): start = time.time() From 4c5216064657b267de378a84dd8ea4989934f25c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 06:53:46 +0100 Subject: [PATCH 014/150] [FileDownloader] Fix progress report on Windows (Fixes #1918) --- youtube_dl/FileDownloader.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 
ac6a6d8a0..8f9577512 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -205,12 +205,20 @@ class FileDownloader(object): self.to_screen(u'[download] Destination: ' + filename) def _report_progress_status(self, msg, is_last_line=False): - clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') + fullmsg = u'[download] ' + msg if self.params.get('progress_with_newline', False): - self.to_screen(u'[download] ' + msg) + self.to_screen(fullmsg) else: - self.to_screen(u'\r%s[download] %s' % (clear_line, msg), - skip_eol=not is_last_line) + if os.name == 'nt': + prev_len = getattr(self, '_report_progress_prev_line_length', + 0) + if prev_len > len(fullmsg): + fullmsg += u' ' * (prev_len - len(fullmsg)) + self._report_progress_prev_line_length = len(fullmsg) + clear_line = u'\r' + else: + clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r') + self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) self.to_console_title(u'youtube-dl ' + msg) def report_progress(self, percent, data_len_str, speed, eta): From 303b479e0a4b7db5b8e30be6ff3ffc1ea267e87c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 06:54:39 +0100 Subject: [PATCH 015/150] Automatically load SSL certs on Windows --- youtube_dl/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b5878830..a84aa59c2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -561,11 +561,14 @@ def make_HTTPS_handler(opts_no_check_certificate): return HTTPSHandlerV3() else: context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) - context.set_default_verify_paths() - context.verify_mode = (ssl.CERT_NONE if opts_no_check_certificate else ssl.CERT_REQUIRED) + context.set_default_verify_paths() + try: + context.load_default_certs() + except AttributeError: + pass # Python < 3.4 return compat_urllib_request.HTTPSHandler(context=context) class 
ExtractorError(Exception): From d4df5ed14c1efb41856224c963307d5aace0b0cf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 06:54:52 +0100 Subject: [PATCH 016/150] release 2013.12.08 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 68b30bfd4..794611810 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.04' +__version__ = '2013.12.08' From 11bf848191e6a2859f2e53dd7639305e0446e821 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 07:22:19 +0100 Subject: [PATCH 017/150] [wimp] simplify --- youtube_dl/extractor/wimp.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index b9c3b13f9..3635691e7 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -11,7 +11,8 @@ class WimpIE(InfoExtractor): u'file': u'deerfence.flv', u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5', u'info_dict': { - u"title": u"Watch Till End: Herd of deer jump over a fence." + u"title": u"Watch Till End: Herd of deer jump over a fence.", + u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. 
This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", } } @@ -19,18 +20,15 @@ class WimpIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'<meta name="description" content="(.+?)" />',webpage, 'video title') - thumbnail_url = self._search_regex(r'<meta property="og\:image" content="(.+?)" />', webpage,'video thumbnail') + title = self._html_search_meta('description', webpage, u'video title') googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url') googleString = base64.b64decode(googleString).decode('ascii') final_url = self._search_regex('","(.*?)"', googleString,'final video url') - ext = final_url.rpartition(u'.')[2] - - return [{ - 'id': video_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - }] + return { + 'id': video_id, + 'url': final_url, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + } From 31812a9e0e13925bc4d266a6d8eff779e10816c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 07:30:42 +0100 Subject: [PATCH 018/150] [youtube:channel] Fix automated channel detection --- youtube_dl/extractor/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 01715024c..35121e6e5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1631,10 +1631,11 @@ class YoutubeChannelIE(InfoExtractor): video_ids = [] url = 'https://www.youtube.com/channel/%s/videos' % channel_id channel_page = self._download_webpage(url, channel_id) - if re.search(r'channel-header-autogenerated-label', channel_page) is not None: - autogenerated = True 
- else: - autogenerated = False + autogenerated = re.search(r'''(?x) + class="[^"]*?(?: + channel-header-autogenerated-label| + yt-channel-title-autogenerated + )[^"]*"''', channel_page) is not None if autogenerated: # The videos are contained in a single page From 22686b91f0883e3686d039a5ddc104abd6cf3b26 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 8 Dec 2013 07:32:25 +0100 Subject: [PATCH 019/150] release 2013.12.08.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 794611810..eaeba1cbf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.08' +__version__ = '2013.12.08.1' From 56a8ab7d6073493856d7e0ab8c5d1f128e3d475b Mon Sep 17 00:00:00 2001 From: Camillo Dell'mour <cdellmour@gmail.com> Date: Sun, 8 Dec 2013 14:02:14 +0100 Subject: [PATCH 020/150] added arte.tv extractor support for subdomain ddc - Mit offenen Karten(german) Le Dessous des Cartes(france) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/arte.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d7cb6e463..7ecafb104 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -8,6 +8,7 @@ from .arte import ( ArteTVPlus7IE, ArteTVCreativeIE, ArteTVFutureIE, + ArteTVDDCIE, ) from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56a5d009f..986643612 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,6 +10,7 @@ from ..utils import ( determine_ext, get_element_by_id, compat_str, + get_element_by_attribute, ) # There are different sources of video in arte.tv, the extraction process @@ -142,7 +143,9 @@ class ArteTVPlus7IE(InfoExtractor): def 
_extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + return self._extract_from_json_url(json_url, video_id, lang) + def _extract_from_json_url(self, json_url, video_id, lang): json_info = self._download_webpage(json_url, video_id, 'Downloading info json') self.report_extraction(video_id) info = json.loads(json_info) @@ -257,3 +260,28 @@ class ArteTVFutureIE(ArteTVPlus7IE): webpage = self._download_webpage(url, anchor_id) row = get_element_by_id(anchor_id, webpage) return self._extract_from_webpage(row, anchor_id, lang) + +class ArteTVDDCIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:ddc' + _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' + + _TEST = { + u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien', + u'file': u'Mit offenen Karten-049881-009_PLUS7-D.flv', + u'info_dict': { + u'title': u'neues-aus-mauretanien', + }, + } + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + if lang == 'folge': + lang = 'de' + elif lang == 'emission': + lang = 'fr' + webpage = self._download_webpage(url, video_id) + scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage) + script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url') + javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') + json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') + return self._extract_from_json_url(json_url, video_id, lang) From 52defb0c9bbf6716b303a06ce2cca7ffee7d8682 Mon Sep 17 00:00:00 2001 From: Camillo Dell'mour <cdellmour@gmail.com> Date: Sun, 8 Dec 2013 16:22:31 +0100 Subject: [PATCH 021/150] made ddc.arte.tv test working --- youtube_dl/extractor/arte.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 
986643612..2c472464e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -267,9 +267,9 @@ class ArteTVDDCIE(ArteTVPlus7IE): _TEST = { u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien', - u'file': u'Mit offenen Karten-049881-009_PLUS7-D.flv', + u'file': u'049881-009_PLUS7-D.flv', u'info_dict': { - u'title': u'neues-aus-mauretanien', + u'title': u'Mit offenen Karten', }, } From ac5118bcb92a489eec0e9997e55e07945dc720db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 8 Dec 2013 16:34:57 +0100 Subject: [PATCH 022/150] [arte.tv:ddc] Add fields to the test and skip download (rtmp) --- youtube_dl/extractor/arte.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2c472464e..4b7bef775 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -261,6 +261,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): row = get_element_by_id(anchor_id, webpage) return self._extract_from_webpage(row, anchor_id, lang) + class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = u'arte.tv:ddc' _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' @@ -270,6 +271,12 @@ class ArteTVDDCIE(ArteTVPlus7IE): u'file': u'049881-009_PLUS7-D.flv', u'info_dict': { u'title': u'Mit offenen Karten', + u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6', + u'upload_date': u'20131207', + }, + u'params': { + # rtmp download + u'skip_download': True, }, } From baa7b1978bc1d77858458c2b31aec3ff819a3e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 8 Dec 2013 22:24:55 +0100 Subject: [PATCH 023/150] Remove the calls to 'compat_urllib_request.urlopen' in a few extractors --- youtube_dl/extractor/bliptv.py | 39 +++++++++++++-------------- youtube_dl/extractor/metacafe.py | 18 +++---------- 
youtube_dl/extractor/mixcloud.py | 11 ++++---- youtube_dl/extractor/stanfordoc.py | 18 +++---------- youtube_dl/extractor/youtube.py | 42 +++++++++--------------------- 5 files changed, 42 insertions(+), 86 deletions(-) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 493504f75..5e33a69df 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -51,8 +51,7 @@ class BlipTVIE(InfoExtractor): url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') urlp = compat_urllib_parse_urlparse(url) if urlp.path.startswith('/play/'): - request = compat_urllib_request.Request(url) - response = compat_urllib_request.urlopen(request) + response = self._request_webpage(url, None, False) redirecturl = response.geturl() rurlp = compat_urllib_parse_urlparse(redirecturl) file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] @@ -69,25 +68,23 @@ class BlipTVIE(InfoExtractor): request.add_header('User-Agent', 'iTunes/10.6.1') self.report_extraction(mobj.group(1)) info = None - try: - urlh = compat_urllib_request.urlopen(request) - if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download - basename = url.split('/')[-1] - title,ext = os.path.splitext(basename) - title = title.decode('UTF-8') - ext = ext.replace('.', '') - self.report_direct_download(title) - info = { - 'id': title, - 'url': url, - 'uploader': None, - 'upload_date': None, - 'title': title, - 'ext': ext, - 'urlhandle': urlh - } - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) + urlh = self._request_webpage(request, None, False, + u'unable to download video info webpage') + if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download + basename = url.split('/')[-1] + title,ext = os.path.splitext(basename) + title = title.decode('UTF-8') + ext = ext.replace('.', '') 
+ self.report_direct_download(title) + info = { + 'id': title, + 'url': url, + 'uploader': None, + 'upload_date': None, + 'title': title, + 'ext': ext, + 'urlhandle': urlh + } if info is None: # Regular URL try: json_code_bytes = urlh.read() diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e59bdd604..bd044fb60 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -1,11 +1,8 @@ import re -import socket from .common import InfoExtractor from ..utils import ( - compat_http_client, compat_parse_qs, - compat_urllib_error, compat_urllib_parse, compat_urllib_request, compat_str, @@ -93,12 +90,8 @@ class MetacafeIE(InfoExtractor): def _real_initialize(self): # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) + self.report_disclaimer() + self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer') # Confirm age disclaimer_form = { @@ -107,11 +100,8 @@ class MetacafeIE(InfoExtractor): } request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') - try: - self.report_age_confirmation() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + self.report_age_confirmation() + self._download_webpage(request, None, False, u'Unable to confirm age') def _real_extract(self, url): # Extract id and simplified title from URL diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index e2baf44d7..04fa3ac7a 
100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,13 +1,10 @@ import json import re -import socket from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_urllib_error, - compat_urllib_request, unified_strdate, + ExtractorError, ) @@ -31,9 +28,11 @@ class MixcloudIE(InfoExtractor): """Returns 1st active url from list""" for url in url_list: try: - compat_urllib_request.urlopen(url) + # We only want to know if the request succeed + # don't download the whole file + self._request_webpage(url, None, False) return url - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error): + except ExtractorError: url = None return None diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index d54e01a12..d0d0989f0 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -1,13 +1,8 @@ import re -import socket -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, compat_str, - compat_urllib_error, - compat_urllib_request, ExtractorError, orderedSet, @@ -45,11 +40,7 @@ class StanfordOpenClassroomIE(InfoExtractor): self.report_extraction(info['id']) baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' xmlUrl = baseUrl + video + '.xml' - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) - mdoc = xml.etree.ElementTree.fromstring(metaXml) + mdoc = self._download_xml(xmlUrl, info['id']) try: info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text @@ -95,12 +86,9 @@ class StanfordOpenClassroomIE(InfoExtractor): 'upload_date': None, } - self.report_download_webpage(info['id']) rootURL = 
'http://openclassroom.stanford.edu/MainFolder/HomePage.php' - try: - rootpage = compat_urllib_request.urlopen(rootURL).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) + rootpage = self._download_webpage(rootURL, info['id'], + errnote=u'Unable to download course info page') info['title'] = info['id'] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35121e6e5..1d211c450 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,7 +7,6 @@ import itertools import json import os.path import re -import socket import string import struct import traceback @@ -17,9 +16,7 @@ from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_chr, - compat_http_client, compat_parse_qs, - compat_urllib_error, compat_urllib_parse, compat_urllib_request, compat_urlparse, @@ -53,9 +50,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): request = compat_urllib_request.Request(self._LANG_URL) try: self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) + self._download_webpage(self._LANG_URL, None, False) + except ExtractorError as err: + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err.cause)) return False return True @@ -67,12 +64,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): raise ExtractorError(u'No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) return False - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return False + login_page = self._download_webpage(self._LOGIN_URL, None, False, + u'Unable to fetch login page') galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"', login_page, u'Login GALX parameter') @@ -105,12 +98,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): request = compat_urllib_request.Request(self._LOGIN_URL, login_data) try: self.report_login() - login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + login_results = self._download_webpage(request, None, False) if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: self._downloader.report_warning(u'unable to log in: bad username or password') return False - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + except ExtractorError as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err.cause)) return False return True @@ -120,11 +113,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'action_confirm': 'Confirm', } request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + self.report_age_confirmation() + self._download_webpage(request, None, False, u'Unable to confirm age') return True def _real_initialize(self): @@ 
-1737,10 +1727,6 @@ class YoutubeSearchIE(SearchInfoExtractor): IE_NAME = u'youtube:search' _SEARCH_KEY = 'ytsearch' - def report_download_page(self, query, pagenum): - """Report attempt to download search page with given number.""" - self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) - def _get_n_results(self, query, n): """Get a specified number of results for a query""" @@ -1749,13 +1735,9 @@ class YoutubeSearchIE(SearchInfoExtractor): limit = n while (50 * pagenum) < limit: - self.report_download_page(query, pagenum+1) result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) - request = compat_urllib_request.Request(result_url) - try: - data = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download API page: %s' % compat_str(err)) + data = self._download_webpage(result_url, u'query "%s"' % query, + u'Downloading page %s' % pagenum, u'Unable to download API page') api_response = json.loads(data)['data'] if not 'items' in api_response: From 7cc3570e53a22dd7f8329ecb4105c5738dcf76c9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 01:49:01 +0100 Subject: [PATCH 024/150] Add fatal=False parameter to _download_* functions. This allows us to simplify the calls in the youtube extractor even further. 
--- youtube_dl/extractor/common.py | 30 +++++++++++---- youtube_dl/extractor/youtube.py | 67 +++++++++++++++++---------------- 2 files changed, 57 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 92a0c5050..534908a2b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -154,27 +154,38 @@ class InfoExtractor(object): def IE_NAME(self): return type(self).__name__[:-2] - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) elif note is not False: - self.to_screen(u'%s: %s' % (video_id, note)) + if video_id is None: + self.to_screen(u'%s' % (note,)) + else: + self.to_screen(u'%s: %s' % (video_id, note)) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is None: errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err) + errmsg = u'%s: %s' % (errnote, compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self._downloader.report_warning(errmsg) + return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + if urlh is False: + assert not fatal + return False 
content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -209,9 +220,14 @@ class InfoExtractor(object): content = webpage_bytes.decode(encoding, 'replace') return (content, urlh) - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the data of the page as a string """ - return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + if res is False: + return res + else: + content, _ = res + return content def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to download XML'): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1d211c450..7f7508c74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -42,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - def _set_language(self): - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - self._download_webpage(self._LANG_URL, None, False) - except ExtractorError as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err.cause)) - return False - return True + return bool(self._download_webpage( + self._LANG_URL, None, + note=u'Setting language', errnote='unable to set language', + fatal=False)) def _login(self): (username, password) = self._get_login_info() @@ -64,8 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): raise ExtractorError(u'No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) return False - login_page = self._download_webpage(self._LOGIN_URL, None, False, - u'Unable to fetch login page') + login_page = self._download_webpage( + self._LOGIN_URL, None, + note=u'Downloading login page', + errnote=u'unable to fetch login page', fatal=False) + if login_page is False: + return galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"', login_page, u'Login GALX parameter') @@ -95,26 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # chokes on unicode login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') - request = compat_urllib_request.Request(self._LOGIN_URL, login_data) - try: - self.report_login() - login_results = self._download_webpage(request, None, False) - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return False - except ExtractorError as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err.cause)) + + req = compat_urllib_request.Request(self._LOGIN_URL, login_data) + login_results = self._download_webpage( + req, None, + note=u'Logging in', errnote=u'unable to log in', fatal=False) + if login_results is False: + return False + if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') return False return True def _confirm_age(self): age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - self.report_age_confirmation() - self._download_webpage(request, None, False, u'Unable to confirm age') + 'next_url': '/', + 'action_confirm': 'Confirm', + } + req = compat_urllib_request.Request(self._AGE_URL, 
compat_urllib_parse.urlencode(age_form)) + + self._download_webpage( + req, None, + note=u'Confirming age', errnote=u'Unable to confirm age') return True def _real_initialize(self): @@ -1736,11 +1734,14 @@ class YoutubeSearchIE(SearchInfoExtractor): while (50 * pagenum) < limit: result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) - data = self._download_webpage(result_url, u'query "%s"' % query, - u'Downloading page %s' % pagenum, u'Unable to download API page') - api_response = json.loads(data)['data'] + data_json = self._download_webpage( + result_url, video_id=u'query "%s"' % query, + note=u'Downloading page %s' % (pagenum + 1), + errnote=u'Unable to download API page') + data = json.loads(data_json) + api_response = data['data'] - if not 'items' in api_response: + if 'items' not in api_response: raise ExtractorError(u'[youtube] No video results') new_ids = list(video['id'] for video in api_response['items']) From ac79fa02b80f91a71b3c1f6d9a750c79d2ff378a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 03:02:54 +0100 Subject: [PATCH 025/150] Restore Python 2.6.<6 compatibility (Fixes #1860) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a84aa59c2..5ba06d965 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -547,7 +547,7 @@ def make_HTTPS_handler(opts_no_check_certificate): def connect(self): sock = socket.create_connection((self.host, self.port), self.timeout) - if self._tunnel_host: + if getattr(self, '_tunnel_host', False): self.sock = sock self._tunnel() try: From ffe62508e4ea5a7ef58f1634dd29fb0fa9ad7837 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 03:03:01 +0100 Subject: [PATCH 026/150] release 2013.12.09 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py 
b/youtube_dl/version.py index eaeba1cbf..4f9e5e12b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.08.1' +__version__ = '2013.12.09' From 0783b09b9253b8dc55d592de98877f25ec7c3b90 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:08:51 +0100 Subject: [PATCH 027/150] Add a workaround for terminals without bidi support (Fixes #1912) --- youtube_dl/YoutubeDL.py | 69 ++++++++++++++++++++++++++++++++--------- youtube_dl/__init__.py | 4 +++ 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 07b36a98e..fc36be56f 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -133,6 +133,8 @@ class YoutubeDL(object): nocheckcertificate:Do not verify SSL certificates proxy: URL of the proxy server to use socket_timeout: Time to wait for unresponsive hosts, in seconds + bidi_workaround: Work around buggy terminals without bidirectional text + support, using fridibi The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -156,8 +158,45 @@ class YoutubeDL(object): self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] + self._err_file = sys.stderr self.params = {} if params is None else params + # Pipe messsages through fribidi + if params.get('bidi_workaround', False): + # fribidi does not support ungetting, so force newlines + params['progress_with_newline'] = True + + for fid in ['_screen_file', '_err_file']: + class FribidiOut(object): + def __init__(self, outfile, errfile): + self.outfile = outfile + self.process = subprocess.Popen( + ['fribidi'], + stdin=subprocess.PIPE, + stdout=outfile, + stderr=errfile) + + def write(self, s): + res = self.process.stdin.write(s) + self.flush() + return res + + def flush(self): + return self.process.stdin.flush() + + def isatty(self): + return 
self.outfile.isatty() + + try: + vout = FribidiOut(getattr(self, fid), self._err_file) + setattr(self, fid, vout) + except OSError as ose: + if ose.errno == 2: + self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') + break + else: + raise + if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and not params['restrictfilenames']): @@ -206,10 +245,14 @@ class YoutubeDL(object): pp.set_downloader(self) def to_screen(self, message, skip_eol=False): + """Print message to stdout if not in quiet mode.""" + return self.to_stdout(message, skip_eol, check_quiet=True) + + def to_stdout(self, message, skip_eol=False, check_quiet=False): """Print message to stdout if not in quiet mode.""" if self.params.get('logger'): self.params['logger'].debug(message) - elif not self.params.get('quiet', False): + elif not check_quiet or not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] output = message + terminator write_string(output, self._screen_file) @@ -221,9 +264,7 @@ class YoutubeDL(object): self.params['logger'].error(message) else: output = message + u'\n' - if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr - output = output.encode(preferredencoding()) - sys.stderr.write(output) + write_string(output, self._err_file) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -294,7 +335,7 @@ class YoutubeDL(object): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' - if sys.stderr.isatty() and os.name != 'nt': + if self._err_file.isatty() and os.name != 'nt': _msg_header = u'\033[0;33mWARNING:\033[0m' else: _msg_header = u'WARNING:' @@ -306,7 +347,7 @@ class YoutubeDL(object): Do the same as 
trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - if sys.stderr.isatty() and os.name != 'nt': + if self._err_file.isatty() and os.name != 'nt': _msg_header = u'\033[0;31mERROR:\033[0m' else: _msg_header = u'ERROR:' @@ -695,22 +736,22 @@ class YoutubeDL(object): # Forced printings if self.params.get('forcetitle', False): - compat_print(info_dict['fulltitle']) + self.to_stdout(info_dict['fulltitle']) if self.params.get('forceid', False): - compat_print(info_dict['id']) + self.to_stdout(info_dict['id']) if self.params.get('forceurl', False): # For RTMP URLs, also include the playpath - compat_print(info_dict['url'] + info_dict.get('play_path', u'')) + self.to_stdout(info_dict['url'] + info_dict.get('play_path', u'')) if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: - compat_print(info_dict['thumbnail']) + self.to_stdout(info_dict['thumbnail']) if self.params.get('forcedescription', False) and info_dict.get('description') is not None: - compat_print(info_dict['description']) + self.to_stdout(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: - compat_print(filename) + self.to_stdout(filename) if self.params.get('forceformat', False): - compat_print(info_dict['format']) + self.to_stdout(info_dict['format']) if self.params.get('forcejson', False): - compat_print(json.dumps(info_dict)) + self.to_stdout(json.dumps(info_dict)) # Do nothing else if in simulate mode if self.params.get('simulate', False): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index d2446b670..2e3f96919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -204,6 +204,9 @@ def parseOpts(overrideArguments=None): general.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, help=optparse.SUPPRESS_HELP) + general.add_option( + '--bidi-workaround', dest='bidi_workaround', action='store_true', + help=u'Work around terminals 
that lack bidirectional text support. Requires fribidi executable in PATH') selection.add_option('--playlist-start', @@ -684,6 +687,7 @@ def _real_main(argv=None): 'nocheckcertificate': opts.no_check_certificate, 'proxy': opts.proxy, 'socket_timeout': opts.socket_timeout, + 'bidi_workaround': opts.bidi_workaround, } with YoutubeDL(ydl_opts) as ydl: From eaa1a7bde32fc3e5ea103b87cc54e915cd70c5e8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:09:06 +0100 Subject: [PATCH 028/150] release 2013.12.09.1 --- README.md | 2 ++ youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 029c418d1..27cfd7080 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ which means you can modify it, redistribute it or use it however you like. default $XDG_CACHE_HOME/youtube-dl or ~/.cache /youtube-dl . --no-cache-dir Disable filesystem caching + --bidi-workaround Work around terminals that lack bidirectional + text support. 
Requires fribidi executable in PATH ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4f9e5e12b..f7f658f49 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09' +__version__ = '2013.12.09.1' From 2a18bc9a4b2b0304a234091e2169283181892b4d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:20:14 +0100 Subject: [PATCH 029/150] Add some bug reporting hints --- README.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 27cfd7080..9d4835053 100644 --- a/README.md +++ b/README.md @@ -276,14 +276,54 @@ This README file was originally written by Daniel Bolton (<https://github.com/db # BUGS -Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> +Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. -Please include: - -* Your exact command line, like `youtube-dl -t "http://www.youtube.com/watch?v=uHlDtZ6Oc3s&feature=channel_video_title"`. A common mistake is not to escape the `&`. Putting URLs in quotes should solve this problem. -* If possible re-run the command with `--verbose`, and include the full output, it is really helpful to us. -* The output of `youtube-dl --version` -* The output of `python --version` -* The name and version of your Operating System ("Ubuntu 11.04 x64" or "Windows 7 x64" is usually enough). +Please include the full output of the command when run with `--verbose`. The output (including the first lines) contain important debugging information. 
Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. For discussions, join us in the irc channel #youtube-dl on freenode. + +When you submit a request, please re-read it once to avoid a couple of mistakes (you can and should use this as a checklist): + +### Is the description of the issue itself sufficient? + +We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. + +So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious + +- What the problem is +- How it could be fixed +- What your proposed solution would look like + +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. + +For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. + +Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. + +### Are you using the latest version? + +Before reporting any issue, type youtube-dl -U. 
This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. + +### Is the issue already documented? + +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or at https://github.com/rg3/youtube-dl/search?type=Issues . If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. + +### Why are existing options not enough? + +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. + +### Is there enough context in your bug report? + +People want to solve problems, and often think they do us a favor by breaking down their larger problems (e.g. wanting to skip already downloaded files) to a specific request (e.g. requesting us to look whether the file exists before downloading the info page). However, what often happens is that they break down the problem into two steps: One simple, and one impossible (or extremely complicated one). + +We are then presented with a very complicated request when the original problem could be solved far easier, e.g. by recording the downloaded video IDs in a separate file. To avoid this, you must include the greater context where it is non-obvious. In particular, every feature request that does not consist of adding support for a new site should contain a use case scenario that explains in what situation the missing feature would be useful. 
+ +### Does the issue involve one problem, and one problem only? + +Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. + +In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. + +### Is anyone going to need the feature? + +Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. 
From a0d96c9843da915bee28c29993d6e1a4e2d21d9f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:31:18 +0100 Subject: [PATCH 030/150] Add filename to --dump-json output (Fixes #1908) --- youtube_dl/YoutubeDL.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fc36be56f..3fbbf3ba0 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -751,6 +751,7 @@ class YoutubeDL(object): if self.params.get('forceformat', False): self.to_stdout(info_dict['format']) if self.params.get('forcejson', False): + info_dict['_filename'] = filename self.to_stdout(json.dumps(info_dict)) # Do nothing else if in simulate mode From 977887469c3ebb7a6d6a7fee3bcd7bcc046f21e3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:50:48 +0100 Subject: [PATCH 031/150] Lower number of expected entries in top list --- test/test_youtube_lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 33db09f43..d9fe5af4e 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -122,7 +122,7 @@ class TestYoutubeLists(unittest.TestCase): ie = YoutubeTopListIE(dl) result = ie.extract('yttoplist:music:Top Tracks') entries = result['entries'] - self.assertTrue(len(entries) >= 9) + self.assertTrue(len(entries) >= 5) if __name__ == '__main__': unittest.main() From 2101830c0d27e6d9ad2f564ff004565e71da1ec5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:53:23 +0100 Subject: [PATCH 032/150] Remove unused imports --- youtube_dl/YoutubeDL.py | 1 - youtube_dl/extractor/metacafe.py | 1 - youtube_dl/extractor/stanfordoc.py | 2 -- youtube_dl/extractor/wimp.py | 3 +-- 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3fbbf3ba0..17b3827f2 100644 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -22,7 +22,6 @@ if os.name == 'nt': from .utils import ( compat_cookiejar, compat_http_client, - compat_print, compat_str, compat_urllib_error, compat_urllib_request, diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index bd044fb60..99d3c83a5 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -5,7 +5,6 @@ from ..utils import ( compat_parse_qs, compat_urllib_parse, compat_urllib_request, - compat_str, determine_ext, ExtractorError, ) diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py index d0d0989f0..44c52c718 100644 --- a/youtube_dl/extractor/stanfordoc.py +++ b/youtube_dl/extractor/stanfordoc.py @@ -2,8 +2,6 @@ import re from .common import InfoExtractor from ..utils import ( - compat_str, - ExtractorError, orderedSet, unescapeHTML, diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 3635691e7..82a626e0e 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -20,10 +20,9 @@ class WimpIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('description', webpage, u'video title') googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url') googleString = base64.b64decode(googleString).decode('ascii') - final_url = self._search_regex('","(.*?)"', googleString,'final video url') + final_url = self._search_regex('","(.*?)"', googleString, u'final video url') return { 'id': video_id, From 395293a88956a030f1be637748d50d216ff317a5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 04:59:50 +0100 Subject: [PATCH 033/150] [--load-info] Always read file as UTF-8 This allows editing the file (and not escaping non-ASCII characters) and reloading it in. 
--- youtube_dl/YoutubeDL.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 79d5c7e5e..6538fc06c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -883,8 +883,7 @@ class YoutubeDL(object): return self._download_retcode def download_with_info_file(self, info_filename): - with open(info_filename, 'r') as f: - # TODO: Check for errors + with io.open(info_filename, 'r', encoding='utf-8') as f: info = json.load(f) try: self.process_ie_result(info, download=True) From de2dd4c502c5d1b7a51546d52bde12169a4f9d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 9 Dec 2013 17:08:58 +0100 Subject: [PATCH 034/150] [soundcloud] add support for private links (fixes #1927) --- youtube_dl/extractor/soundcloud.py | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cb6dedab7..0571b36ac 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import json import re import itertools @@ -23,7 +24,9 @@ class SoundcloudIE(InfoExtractor): """ _VALID_URL = r'''^(?:https?://)? - (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) + (?:(?:(?:www\.)?soundcloud\.com/ + (?P<uploader>[\w\d-]+)/(?P<title>[\w\d-]+)/? 
+ (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) ) @@ -56,6 +59,19 @@ class SoundcloudIE(InfoExtractor): u'skip_download': True, }, }, + # private link + { + u'url': u'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp', + u'md5': u'aa0dd32bfea9b0c5ef4f02aacd080604', + u'info_dict': { + u'id': u'123998367', + u'ext': u'mp3', + u'title': u'Youtube - Dl Test Video \'\' Ä↭', + u'uploader': u'jaimeMF', + u'description': u'test chars: \"\'/\\ä↭', + u'upload_date': u'20131209', + }, + }, ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -73,7 +89,7 @@ class SoundcloudIE(InfoExtractor): def _resolv_url(cls, url): return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - def _extract_info_dict(self, info, full_title=None, quiet=False): + def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): track_id = compat_str(info['id']) name = full_title or track_id if quiet: @@ -104,8 +120,10 @@ class SoundcloudIE(InfoExtractor): }] else: # We have to retrieve the url + streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' 
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) stream_json = self._download_webpage( - 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID), + streams_url, track_id, u'Downloading track url') formats = [] @@ -157,6 +175,7 @@ class SoundcloudIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) track_id = mobj.group('track_id') + token = None if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id @@ -165,19 +184,22 @@ class SoundcloudIE(InfoExtractor): return self.url_result(query['url'][0], ie='Soundcloud') else: # extract uploader (which is in the url) - uploader = mobj.group(1) + uploader = mobj.group('uploader') # extract simple title (uploader + slug of song title) - slug_title = mobj.group(2) - full_title = '%s/%s' % (uploader, slug_title) + slug_title = mobj.group('title') + token = mobj.group('token') + full_title = resolve_title = '%s/%s' % (uploader, slug_title) + if token: + resolve_title += '/%s' % token self.report_resolve(full_title) - url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) + url = 'http://soundcloud.com/%s' % resolve_title info_json_url = self._resolv_url(url) info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON') info = json.loads(info_json) - return self._extract_info_dict(info, full_title) + return self._extract_info_dict(info, full_title, secret_token=token) class SoundcloudSetIE(SoundcloudIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' From 1c088fa89ddf1e7065334e9063c378d90c668cdb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 18:29:07 +0100 Subject: [PATCH 035/150] Improve --bidi-workaround support --- youtube_dl/YoutubeDL.py | 69 +++++++++++++++++++++-------------------- youtube_dl/__init__.py | 
17 ++-------- youtube_dl/utils.py | 18 +++++++++++ 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6538fc06c..2dd7e4907 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -33,6 +33,7 @@ from .utils import ( encodeFilename, ExtractorError, format_bytes, + get_term_width, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -160,41 +161,26 @@ class YoutubeDL(object): self._err_file = sys.stderr self.params = {} if params is None else params - # Pipe messsages through fribidi if params.get('bidi_workaround', False): - # fribidi does not support ungetting, so force newlines - params['progress_with_newline'] = True - - for fid in ['_screen_file', '_err_file']: - class FribidiOut(object): - def __init__(self, outfile, errfile): - self.outfile = outfile - self.process = subprocess.Popen( - ['fribidi'], - stdin=subprocess.PIPE, - stdout=outfile, - stderr=errfile) - - def write(self, s): - res = self.process.stdin.write(s) - self.flush() - return res - - def flush(self): - return self.process.stdin.flush() - - def isatty(self): - return self.outfile.isatty() - - try: - vout = FribidiOut(getattr(self, fid), self._err_file) - setattr(self, fid, vout) - except OSError as ose: - if ose.errno == 2: - self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') - break - else: - raise + try: + import pty + master, slave = pty.openpty() + width = get_term_width() + if width is None: + width_args = [] + else: + width_args = ['-w', str(width)] + self._fribidi = subprocess.Popen( + ['fribidi', '-c', 'UTF-8'] + width_args, + stdin=subprocess.PIPE, + stdout=slave, + stderr=self._err_file) + self._fribidi_channel = os.fdopen(master, 'rb') + except OSError as ose: + if ose.errno == 2: + self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . 
Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise if (sys.version_info >= (3,) and sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -243,6 +229,18 @@ class YoutubeDL(object): self._pps.append(pp) pp.set_downloader(self) + def _bidi_workaround(self, message): + if not hasattr(self, '_fribidi_channel'): + return message + + assert type(message) == type(u'') + line_count = message.count(u'\n') + 1 + self._fribidi.stdin.write((message + u'\n').encode('utf-8')) + self._fribidi.stdin.flush() + res = u''.join(self._fribidi_channel.readline().decode('utf-8') + for _ in range(line_count)) + return res[:-len(u'\n')] + def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" return self.to_stdout(message, skip_eol, check_quiet=True) @@ -252,8 +250,10 @@ class YoutubeDL(object): if self.params.get('logger'): self.params['logger'].debug(message) elif not check_quiet or not self.params.get('quiet', False): + message = self._bidi_workaround(message) terminator = [u'\n', u''][skip_eol] output = message + terminator + write_string(output, self._screen_file) def to_stderr(self, message): @@ -262,6 +262,7 @@ class YoutubeDL(object): if self.params.get('logger'): self.params['logger'].error(message) else: + message = self._bidi_workaround(message) output = message + u'\n' write_string(output, self._err_file) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6e9dd68c4..3e82cd637 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -48,7 +48,6 @@ import os import random import re import shlex -import subprocess import sys @@ -57,6 +56,7 @@ from .utils import ( DateRange, decodeOption, determine_ext, + get_term_width, DownloadError, get_cachedir, MaxDownloadsReached, @@ -113,19 +113,6 @@ def parseOpts(overrideArguments=None): def _comma_separated_values_options_callback(option, opt_str, value, parser): 
setattr(parser.values, option.dest, value.split(',')) - def _find_term_columns(): - columns = os.environ.get('COLUMNS', None) - if columns: - return int(columns) - - try: - sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out,err = sp.communicate() - return int(out.split()[1]) - except: - pass - return None - def _hide_login_info(opts): opts = list(opts) for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: @@ -140,7 +127,7 @@ def parseOpts(overrideArguments=None): max_help_position = 80 # No need to wrap help messages if we're on a wide console - columns = _find_term_columns() + columns = get_term_width() if columns: max_width = columns fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5ba06d965..64300d8e0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -15,6 +15,7 @@ import platform import re import ssl import socket +import subprocess import sys import traceback import zlib @@ -1024,6 +1025,23 @@ def format_bytes(bytes): converted = float(bytes) / float(1024 ** exponent) return u'%.2f%s' % (converted, suffix) + def str_to_int(int_str): int_str = re.sub(r'[,\.]', u'', int_str) return int(int_str) + + +def get_term_width(): + columns = os.environ.get('COLUMNS', None) + if columns: + return int(columns) + + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + return int(out.split()[1]) + except: + pass + return None From df8ae1e3a2222acf189c7b82278a773e93887cec Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 18:31:31 +0100 Subject: [PATCH 036/150] release 2013.12.09.2 --- README.md | 2 ++ youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d4835053..68b2e1ae7 100644 --- a/README.md +++ b/README.md 
@@ -100,6 +100,8 @@ which means you can modify it, redistribute it or use it however you like. --restrict-filenames Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames -a, --batch-file FILE file containing URLs to download ('-' for stdin) + --load-info FILE json file containing the video information + (created with the "--write-json" option -w, --no-overwrites do not overwrite files -c, --continue force resume of partially downloaded files. By default, youtube-dl will resume downloads if diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f7f658f49..0b19fa9c5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.1' +__version__ = '2013.12.09.2' From 1d87e3a1c64e3183d17a6dc6fbc4353a86aa5f99 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 18:55:54 +0100 Subject: [PATCH 037/150] [rtlnow] Allow double slashes after domain name (Fixes #1928) --- youtube_dl/extractor/rtlnow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 2f238de35..511674d8d 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -7,14 +7,15 @@ from ..utils import ( ExtractorError, ) + class RTLnowIE(InfoExtractor): """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/|(?:www\.)?n-tvnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _VALID_URL = 
r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', u'info_dict': { - u'upload_date': u'20070416', + u'upload_date': u'20070416', u'title': u'Ahornallee - Folge 1 - Der Einzug', u'description': u'Folge 1 - Der Einzug', }, From 1e1f84dac9b4fe8cc32f3d5aefb5fceb5d65232a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 18:56:17 +0100 Subject: [PATCH 038/150] release 2013.12.09.3 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0b19fa9c5..17c0e5542 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.2' +__version__ = '2013.12.09.3' From caefb1de877e163fa3ece44757cb2fae6adf47e4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 19:39:41 +0100 Subject: [PATCH 039/150] [ndtv] Add extractor (Fixes #1924) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ndtv.py | 66 ++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 12 ++++++ 3 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/ndtv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3f740baa1..1149dc1ec 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -100,6 +100,7 @@ from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE +from .ndtv import NDTVIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE diff --git 
a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py new file mode 100644 index 000000000..2e8501f99 --- /dev/null +++ b/youtube_dl/extractor/ndtv.py @@ -0,0 +1,66 @@ +import json +import re +import time + +from .common import InfoExtractor +from ..utils import month_by_name + + +class NDTVIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)' + + _TEST = { + u"url": u"http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710", + u"file": u"300710.mp4", + u"md5": u"39f992dbe5fb531c395d8bbedb1e5e88", + u"info_dict": { + u"title": u"NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal", + u"description": u"In an exclusive interview to NDTV, Aam Aadmi Party's Arvind Kejriwal says it makes no difference to him that Rahul Gandhi said the Congress needs to learn from his party.", + u"upload_date": u"20131208", + u"duration": 1327, + u"thumbnail": u"http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg", + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + filename = self._search_regex( + r"__filename='([^']+)'", webpage, u'video filename') + video_url = (u'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % + filename) + + duration_str = filename = self._search_regex( + r"__duration='([^']+)'", webpage, u'duration', fatal=False) + duration = None if duration_str is None else int(duration_str) + + date_m = re.search(r'''(?x) + <p\s+class="vod_dateline">\s* + Published\s+On:\s* + (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+) + ''', webpage) + upload_date = None + assert date_m + if date_m is not None: + month = month_by_name(date_m.group('monthname')) + if month is not None: + upload_date = '%s%02d%02d' % ( + date_m.group('year'), month, 
int(date_m.group('day'))) + + description = self._og_search_description(webpage) + READ_MORE = u' (Read more)' + if description.endswith(READ_MORE): + description = description[:-len(READ_MORE)] + + return { + 'id': video_id, + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'upload_date': upload_date, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64300d8e0..0dab9fcc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1045,3 +1045,15 @@ def get_term_width(): except: pass return None + + +def month_by_name(name): + """ Return the number of a month by (locale-independently) English name """ + + ENGLISH_NAMES = [ + u'Januar', u'February', u'March', u'April', u'May', u'June', + u'July', u'August', u'September', u'October', u'November', u'December'] + try: + return ENGLISH_NAMES.index(name) + 1 + except ValueError: + return None From 4ff50ef84607ee60ff813f7d7f3d35c8b497bf07 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 19:57:00 +0100 Subject: [PATCH 040/150] [soundcloud] Do not match sets (Fixes #1930) --- test/test_all_urls.py | 3 +++ youtube_dl/extractor/soundcloud.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 6b9764c67..e9458b2e3 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -110,6 +110,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) + # https://github.com/rg3/youtube-dl/issues/1930 + def test_soundcloud_not_matching_sets(self): + self.assertMatch('http://soundcloud.com/floex/sets/gone-ep', ['soundcloud:set']) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py 
index 0571b36ac..5c026c0b8 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -25,7 +25,8 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''^(?:https?://)? (?:(?:(?:www\.)?soundcloud\.com/ - (?P<uploader>[\w\d-]+)/(?P<title>[\w\d-]+)/? + (?P<uploader>[\w\d-]+)/ + (?!sets/)(?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) From 77526143e789368a15a0e89bfed5dbc52f8db9e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 9 Dec 2013 20:01:43 +0100 Subject: [PATCH 041/150] [brightcove] Use the original url (usually the player) as the default referer (fixes #1929) --- youtube_dl/extractor/brightcove.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 66fe0ac9a..63577a3cb 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -55,6 +55,18 @@ class BrightcoveIE(InfoExtractor): u'uploader': u'Mashable', }, }, + { + # test that the default referer works + # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ + u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', + u'info_dict': { + u'id': u'2878862109001', + u'ext': u'mp4', + u'title': u'Lost in Motion II', + u'description': u'md5:363109c02998fee92ec02211bd8000df', + u'uploader': u'National Ballet of Canada', + }, + }, ] @classmethod @@ -118,17 +130,21 @@ class BrightcoveIE(InfoExtractor): videoPlayer = query.get('@videoPlayer') if videoPlayer: - return self._get_video_info(videoPlayer[0], query_str, query) + return self._get_video_info(videoPlayer[0], query_str, query, + # We set the original url as the default 'Referer' header + 
referer=url) else: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) - def _get_video_info(self, video_id, query_str, query): + def _get_video_info(self, video_id, query_str, query, referer=None): request_url = self._FEDERATED_URL_TEMPLATE % query_str req = compat_urllib_request.Request(request_url) linkBase = query.get('linkBaseURL') if linkBase is not None: - req.add_header('Referer', linkBase[0]) + referer = linkBase[0] + if referer is not None: + req.add_header('Referer', referer) webpage = self._download_webpage(req, video_id) self.report_extraction(video_id) From f8bd0194a7723bcb702c34816e8fa522fce4e723 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 20:05:10 +0100 Subject: [PATCH 042/150] Remove superfluous spaces --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 63577a3cb..b1b7526ca 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -64,7 +64,7 @@ class BrightcoveIE(InfoExtractor): u'ext': u'mp4', u'title': u'Lost in Motion II', u'description': u'md5:363109c02998fee92ec02211bd8000df', - u'uploader': u'National Ballet of Canada', + u'uploader': u'National Ballet of Canada', }, }, ] From 49929a20a7166199307b9d8eda623f8b81540bdb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 9 Dec 2013 20:05:27 +0100 Subject: [PATCH 043/150] release 2013.12.09.4 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 17c0e5542..8906d6090 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.3' +__version__ = '2013.12.09.4' From 26e6393134b35121ab956a408250c565596dd2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= 
<jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 9 Dec 2013 22:00:42 +0100 Subject: [PATCH 044/150] Set 'NA' as the default value for missing fields in the output template (fixes #1931) Remove the `except KeyError` clause, it won't get raised anymore --- test/test_YoutubeDL.py | 15 +++++++++++++++ youtube_dl/YoutubeDL.py | 5 ++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 58cf9c313..3100c362a 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -7,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL +from youtube_dl import YoutubeDL class YDL(FakeYDL): @@ -140,6 +141,20 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(test_dict['extractor'], 'Foo') self.assertEqual(test_dict['playlist'], 'funny videos') + def test_prepare_filename(self): + info = { + u'id': u'1234', + u'ext': u'mp4', + u'width': None, + } + def fname(templ): + ydl = YoutubeDL({'outtmpl': templ}) + return ydl.prepare_filename(info) + self.assertEqual(fname(u'%(id)s.%(ext)s'), u'1234.mp4') + self.assertEqual(fname(u'%(id)s-%(width)s.%(ext)s'), u'1234-NA.mp4') + # Replace missing fields with 'NA' + self.assertEqual(fname(u'%(uploader_date)s-%(id)s.%(ext)s'), u'NA-1234.mp4') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2dd7e4907..11d4972dd 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -3,6 +3,7 @@ from __future__ import absolute_import +import collections import errno import io import json @@ -401,13 +402,11 @@ class YoutubeDL(object): is_id=(k == u'id')) template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items()) + template_dict = collections.defaultdict(lambda: u'NA', template_dict) tmpl = os.path.expanduser(self.params['outtmpl']) filename = tmpl % template_dict return filename - except KeyError as err: 
- self.report_error(u'Erroneous output template') - return None except ValueError as err: self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') return None From 45598aab0873c31dd6200899192ddeb07bd2c472 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 Dec 2013 11:23:35 +0100 Subject: [PATCH 045/150] [YoutubeDL] Simplify filename preparation --- youtube_dl/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 11d4972dd..cabe30980 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -397,11 +397,12 @@ class YoutubeDL(object): template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] sanitize = lambda k, v: sanitize_filename( - u'NA' if v is None else compat_str(v), + compat_str(v), restricted=self.params.get('restrictfilenames'), is_id=(k == u'id')) template_dict = dict((k, sanitize(k, v)) - for k, v in template_dict.items()) + for k, v in template_dict.items() + if v is not None) template_dict = collections.defaultdict(lambda: u'NA', template_dict) tmpl = os.path.expanduser(self.params['outtmpl']) From 475700acfe3ee1604b884450c6a35c709b6410d5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 Dec 2013 11:45:13 +0100 Subject: [PATCH 046/150] [soundcloud] Do not mistake original_format for ext (Fixes #1934) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5c026c0b8..cc5f14593 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -99,7 +99,7 @@ class SoundcloudIE(InfoExtractor): thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') - ext = info.get('original_format', u'mp3') + ext = u'mp3' result = { 
'id': track_id, 'uploader': info['user']['username'], From 5a3ea17c9470c93be27f4b177f2e4114bb06d0e3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 Dec 2013 11:52:10 +0100 Subject: [PATCH 047/150] [zdf] Correct order of unknown formats (#1936) --- youtube_dl/extractor/zdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 689f19735..35ece354a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor): try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - proto_pref = 999 + proto_pref = -999 quality = fnode.find('./quality').text QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: quality_pref = -QUALITY_ORDER.index(quality) except ValueError: - quality_pref = 999 + quality_pref = -999 abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 From a30a60d8eb027a55ec14c912bad4359b3128997e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 Dec 2013 11:54:59 +0100 Subject: [PATCH 048/150] release 2013.12.10 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8906d6090..4da3026b3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.4' +__version__ = '2013.12.10' From e2b38da93112c97d46d612bf89c329b22ac2d00d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 Dec 2013 12:45:22 +0100 Subject: [PATCH 049/150] [mtv] Fixup incorrectly encoded XML documents --- youtube_dl/extractor/common.py | 5 ++++- youtube_dl/extractor/mtv.py | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 534908a2b..69a083b68 100644 
--- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -230,9 +230,12 @@ class InfoExtractor(object): return content def _download_xml(self, url_or_request, video_id, - note=u'Downloading XML', errnote=u'Unable to download XML'): + note=u'Downloading XML', errnote=u'Unable to download XML', + transform_source=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) def to_screen(self, msg): diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 6b3feb560..5b2bd9633 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, - u'Downloading info') + + def fix_ampersand(s): + """ Fix unencoded ampersand in XML """ + return s.replace(u'& ', '& ') + idoc = self._download_xml( + self._FEED_URL + '?' + data, video_id, + u'Downloading info', transform_source=fix_ampersand) return [self._get_video_info(item) for item in idoc.findall('.//item')] From f67ca84d4a77bb15b959bcc2c0a88ad5c4efc9af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 13:04:21 +0100 Subject: [PATCH 050/150] [soundcloud] Fix the extension for 'downloadable' songs In this case the 'original_format' field must be used. 
--- youtube_dl/extractor/soundcloud.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cc5f14593..cbba4094b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor): u'upload_date': u'20131209', }, }, + # downloadable song + { + u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', + u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', + u'info_dict': { + u'id': u'105614606', + u'ext': u'wav', + u'title': u'Just Your Problem Baby (Acapella)', + u'description': u'Vocals', + u'uploader': u'Sim Gretina', + u'upload_date': u'20130815', + }, + }, ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor): track_id, self._CLIENT_ID)) result['formats'] = [{ 'format_id': 'download', - 'ext': ext, + 'ext': info.get('original_format', u'mp3'), 'url': format_url, 'vcodec': 'none', }] From cbfc4702280dcf26b76a1e2be151cdb97a293f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 13:42:41 +0100 Subject: [PATCH 051/150] [mixcloud] Try to get the m4a url if the mp3 url fails to download (fixes #1939) --- youtube_dl/extractor/mixcloud.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 04fa3ac7a..125d81551 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor): return None + def _get_url(self, template_url): + return self.check_urls(template_url % i for i in range(30)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, 
u'preview url') song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self.check_urls(template_url % i for i in range(30)) + final_song_url = self._get_url(template_url) + if final_song_url is None: + self.to_screen('Trying with m4a extension') + template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') + final_song_url = self._get_url(template_url) + if final_song_url is None: + raise ExtractorError(u'Unable to extract track url') return { 'id': track_id, 'title': info['name'], 'url': final_song_url, - 'ext': 'mp3', 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], From df1d7da2afe748a9bc7be3ef43f96914dc0d2576 Mon Sep 17 00:00:00 2001 From: mc2avr <mc2avr@googlemail.com> Date: Tue, 10 Dec 2013 18:40:50 +0100 Subject: [PATCH 052/150] add MDRIE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mdr.py | 78 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/mdr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d7cb6e463..8a063648c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -86,6 +86,7 @@ from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE, LivestreamOriginalIE +from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py new file mode 100644 index 000000000..8b096a78a --- /dev/null +++ b/youtube_dl/extractor/mdr.py @@ -0,0 +1,78 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + +class MDRIE(InfoExtractor): + _VALID_URL = 
r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*' + _TITLE = r'<h2>(?P<title1>[^<]+)<span>(?P<title2>[^<]+)</span></h2>' + + _MEDIA_XML = r'(?P<xmlurl>/mediathek/(.+)/(video|audio)([0-9]+)-avCustom.xml)' + _MEDIA_STREAM_VIDEO = r'<asset>.*<frameWidth>(?P<frameWidth>[0-9]+)</frameWidth>.*<flashMediaServerApplicationURL>(?P<flashMediaServerApplicationURL>[^<]+)</flashMediaServerApplicationURL><flashMediaServerURL>(?P<flashMediaServerURL>[^<]+)</flashMediaServerURL>.*<progressiveDownloadUrl>(?P<progressiveDownloadUrl>[^<]+)</progressiveDownloadUrl></asset>' + _MEDIA_STREAM_AUDIO = r'<asset>.*<mediaType>(?P<mediaType>[A-Z0-9]+)</mediaType><bitrateAudio>(?P<bitrateAudio>[0-9]+)</bitrateAudio>.*<flashMediaServerApplicationURL>(?P<flashMediaServerApplicationURL>[^<]+)</flashMediaServerApplicationURL><flashMediaServerURL>(?P<flashMediaServerURL>[^<]+)</flashMediaServerURL>.*<progressiveDownloadUrl>(?P<progressiveDownloadUrl>[^<]+)</progressiveDownloadUrl></asset>' + _TESTS = [{ + u'url': u'http://www.mdr.de/mediathek/themen/nachrichten/video165624_zc-c5c7de76_zs-3795826d.html', + u'file': u'165624.mp4', + u'md5': u'95165945756198b8fa2dea10f0b04614', + u'info_dict': { + u"title": u"MDR aktuell Eins30 09.12.2013, 22:48 Uhr" + }, + #u'skip': u'Requires rtmpdump' # rtmp is optional + }, + { + u'url': u' http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html', + u'file': u'718370.mp4', + u'md5': u'4a5b1fbb5519fb0d929c384b6ff7cb8b', + u'info_dict': { + u"title": u"MDR 1 RADIO SACHSEN 10.12.2013, 05:00 Uhr" + }, + #u'skip': u'Requires rtmpdump' # rtmp is optional + }] + + def _real_extract(self, url): + + # determine video id from url + m = re.match(self._VALID_URL, url) + video_id = m.group('video_id') + domain = m.group('domain') + mediatype = m.group('type') + + # determine title and media streams from webpage + html = self._download_webpage(url, video_id) + t = 
re.search(self._TITLE, html) + if not t: + raise ExtractorError(u'no title found') + title = t.group('title1') + t.group('title2') + m = re.search(self._MEDIA_XML, html) + if not m: + raise ExtractorError(u'no xml found') + xmlurl = m.group('xmlurl') + xml = self._download_webpage(domain+xmlurl, video_id, 'download XML').replace('\n','').replace('\r','').replace('<asset>','\n<asset>').replace('</asset>','</asset>\n') + if(mediatype == "video"): + streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM_VIDEO, xml)] + if not streams: + raise ExtractorError(u'no media found') + # choose default media type and highest quality for now + stream = max([s for s in streams if s["progressiveDownloadUrl"].startswith("http://") ], + key=lambda s: int(s["frameWidth"])) + else: + streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM_AUDIO, xml)] + if not streams: + raise ExtractorError(u'no media found') + # choose default media type (MP4) and highest quality for now + stream = max([s for s in streams if s["progressiveDownloadUrl"].startswith("http://") and s["mediaType"] == "MP4" ], + key=lambda s: int(s["bitrateAudio"])) + + # there's two possibilities: RTMP stream or HTTP download + info = {'id': video_id, 'title': title, 'ext': 'mp4'} + if not stream["progressiveDownloadUrl"]: + self.to_screen(u'RTMP download detected') + assert stream['flashMediaServerURL'].startswith('mp4:') + info["url"] = stream["flashMediaServerApplicationURL"] + info["play_path"] = stream['flashMediaServerURL'] + else: + assert stream["progressiveDownloadUrl"].endswith('.mp4') + info["url"] = stream["progressiveDownloadUrl"] + return [info] From 7c86cd5ab1c8e9091c42eaae7354520a708f7431 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 19:44:16 +0100 Subject: [PATCH 053/150] [dailymotion] Fix uploader extraction Now it looks directly in the info dictionary --- 
youtube_dl/extractor/dailymotion.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3bd0b862c..7f2f0d6a4 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') - video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], - webpage, 'video uploader', fatal=False) age_limit = self._rta_search(webpage) video_upload_date = None @@ -153,7 +149,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return { 'id': video_id, 'formats': formats, - 'uploader': video_uploader, + 'uploader': info['owner_screenname'], 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, From 5458b4cefbfff4b419f22ad33fbaffda25b83382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 19:47:00 +0100 Subject: [PATCH 054/150] [dailymotion] Fix view count extraction and make it non fatal (fixes #1940) --- youtube_dl/extractor/dailymotion.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7f2f0d6a4..aea7e557e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -143,8 +143,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) + view_count = 
self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False) + if view_count is not None: + view_count = str_to_int(view_count) return { 'id': video_id, From 48ad51b243b1fbca7f1e72e209f38f5ca90335ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 20:28:12 +0100 Subject: [PATCH 055/150] [vimeo] Fix the extraction for some 'player' or 'pro' videos The variable the config dict is assigned to can change, now we try to detect it or fallback to a, b or c --- youtube_dl/extractor/vimeo.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fb2bd225a..7c7f537d8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], - webpage, u'info section', flags=re.DOTALL) + # We try to find out to which variable is assigned the config dic + m_variable_name = re.search('(\w)\.video\.id', webpage) + if m_variable_name is not None: + config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) + else: + config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config = self._search_regex(config_re, webpage, u'info section', + flags=re.DOTALL) config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): From a0088bdf9342408a1fc5033a0f4599bae3b9aa0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 20:43:16 +0100 Subject: [PATCH 056/150] [vimeo] Fix unused argument of the `_real_extract` method --- 
youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7c7f537d8..ea4409528 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url, new_video=True): + def _real_extract(self, url): url, data = unsmuggle_url(url) headers = std_headers if data is not None: From 182583623583c8e71af9b4e24acf8c409fcff197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 10 Dec 2013 21:03:53 +0100 Subject: [PATCH 057/150] Use `_download_xml` in more extractors --- youtube_dl/extractor/appletrailers.py | 23 +++++++++++------------ youtube_dl/extractor/clipsyndicate.py | 10 ++++------ youtube_dl/extractor/metacritic.py | 9 +++++---- youtube_dl/utils.py | 5 +++++ 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor): uploader_id = mobj.group('company') playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') - playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) - playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) - # The ' in the onClick attributes are not escaped, it couldn't be parsed - # with xml.etree.ElementTree.fromstring - # like: http://trailers.apple.com/trailers/wb/gravity/ - def _clean_json(m): - return u'iTunes.playURL(%s);' % 
m.group(1).replace('\'', '&#39;') - playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) - playlist_html = u'<html>' + playlist_cleaned + u'</html>' + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) + s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') + s = re.sub(self._JSON_RE, _clean_json, s) + s = u'<html>' + s + u'</html>' + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( find_xpath_attr, + fix_xml_all_ampersand, ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor): # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') - playlist_page = self._download_webpage( + pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info') - # Fix broken xml - playlist_page = re.sub('&', '&amp;', playlist_page) - pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + video_id, u'Downloading video info', + transform_source=fix_xml_all_ampersand) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ 
b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@ import re -import xml.etree.ElementTree import operator from .common import InfoExtractor +from ..utils import ( + fix_xml_all_ampersand, +) class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' - info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, - video_id, u'Downloading info xml').replace('&', '&amp;') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0dab9fcc5..4593488ce 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1057,3 +1057,8 @@ def month_by_name(name): return ENGLISH_NAMES.index(name) + 1 except ValueError: return None + + +def fix_xml_all_ampersand(xml_str): + """Replace all the '&' by '&amp;' in XML""" + return xml_str.replace(u'&', u'&amp;') From 08d03235f9a5a7d28d7992a3caf27a8b88b69e12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 11 Dec 2013 08:45:51 +0100 Subject: [PATCH 058/150] release 2013.12.11 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4da3026b3..a4a89317c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.10' +__version__ = '2013.12.11' From 357ddadbf53b48721d43061edc67d37c73d1ac95 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 11 Dec 2013 08:54:48 +0100 Subject: [PATCH 059/150] Fix thumbnail filename determination (Fixes #1945) --- 
youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cabe30980..c77777ba0 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -827,7 +827,7 @@ class YoutubeDL(object): if self.params.get('writethumbnail', False): if info_dict.get('thumbnail') is not None: thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') - thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format + thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format self.to_screen(u'[%s] %s: Downloading thumbnail ...' % (info_dict['extractor'], info_dict['id'])) try: From fca1ef19c1dfc5cd841424700816e1ad3b3112eb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 11 Dec 2013 08:54:54 +0100 Subject: [PATCH 060/150] release 2013.12.11.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a4a89317c..a3ac65579 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.11' +__version__ = '2013.12.11.1' From 00381b4ccbf0504f15f48aeddc242fd8535b5c2d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 11 Dec 2013 09:22:08 +0100 Subject: [PATCH 061/150] [pornhub] Fix URL regexp --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8b3471919..d9135c6b9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -12,7 +12,7 @@ from ..aes import ( ) class PornHubIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))' _TEST = { u'url': 
u'http://www.pornhub.com/view_video.php?viewkey=648719015', u'file': u'648719015.mp4', From f2c36ee43e06f81cd0ac77e5b1d0c238c0057623 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 11 Dec 2013 09:22:25 +0100 Subject: [PATCH 062/150] release 2013.12.11.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a3ac65579..b9a52fcfa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.11.1' +__version__ = '2013.12.11.2' From df5374743615fe54178b3942b115e7168ce0cd97 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 12 Dec 2013 15:13:45 +0700 Subject: [PATCH 063/150] [channel9] Initial implementation (#1885) --- youtube_dl/extractor/channel9.py | 351 +++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 youtube_dl/extractor/channel9.py diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py new file mode 100644 index 000000000..016c4497a --- /dev/null +++ b/youtube_dl/extractor/channel9.py @@ -0,0 +1,351 @@ +# encoding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ( + format_bytes, + ExtractorError, +) + +class Channel9IE(InfoExtractor): + ''' + Common extractor for channel9.msdn.com. + + The type of provided URL (video or playlist) is determined according to + meta Search.PageType from web page HTML rather than URL itself, as it is + not always possible to do. + ''' + IE_DESC = u'Channel 9' + IE_NAME = u'channel9' + _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?' 
+ + _TESTS = [ + { + u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', + u'file': u'Events_TechEd_Australia_2013_KOS002.mp4', + u'md5': u'bbd75296ba47916b754e73c3a4bbdf10', + u'info_dict': { + u'title': u'Developer Kick-Off Session: Stuff We Love', + u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f', + u'duration': 4576, + u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + u'session_code': u'KOS002', + u'session_day': u'Day 1', + u'session_room': u'Arena 1A', + u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ], + }, + }, + { + u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', + u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4', + u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', + u'info_dict': { + u'title': u'Self-service BI with Power BI - nuclear testing', + u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10', + u'duration': 1540, + u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + u'authors': [ u'Mike Wilmot' ], + }, + } + ] + + _RSS_URL = 'http://channel9.msdn.com/%s/RSS' + _EXTRACT_ENTRY_ITEMS_FROM_RSS = False + + # Sorted by quality + _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] + + def _restore_bytes(self, formatted_size): + if not formatted_size: + return 0 + m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size) + if not m: + return 0 + units = m.group('units') + try: + exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper()) + except ValueError: + return 0 + size = float(m.group('size')) + return int(size * (1024 ** exponent)) + + def _formats_from_html(self, html): + FORMAT_REGEX = r''' + (?x) + <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* + 
<span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* + (?:<div\s+class="popup\s+rounded">\s* + <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* + </div>)? # File size part may be missing + ''' + # Extract known formats + formats = [{'url': x.group('url'), + 'format_id': x.group('quality'), + 'format_note': x.group('note'), + 'format': '%s (%s)' % (x.group('quality'), x.group('note')), + 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate + } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + # Sort according to known formats list + formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + return formats + + def _formats_from_rss_item(self, item): + + def process_formats(elem): + formats = [] + for media_content in elem.findall('./{http://search.yahoo.com/mrss/}content'): + url = media_content.attrib['url'] + # Ignore unrelated media + if url.endswith('.ism/manifest'): + continue + format_note = media_content.attrib['type'] + filesize = int(media_content.attrib['fileSize']) + formats.append({'url': url, + 'format_note': format_note, + 'format': '%s %s' % (format_note, format_bytes(filesize)), + 'filesize': filesize, + }) + return formats + + formats = [] + + for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'): + formats.extend(process_formats(media_group)) + + # Sometimes there are no media:groups in item, but there is media:content + # right in item (usually when there is the only media source) + formats.extend(process_formats(item)) + + # Sort by file size + formats.sort(key=lambda fmt: fmt['filesize']) + return formats + + def _extract_title(self, html): + title = self._html_search_meta(u'title', html, u'title') + if title is None: + title = self._og_search_title(html) + TITLE_SUFFIX = u' (Channel 9)' + if title is not None and title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + return title + + def _extract_description(self, html): + 
DESCRIPTION_REGEX = r'''(?sx) + <div\s+class="entry-content">\s* + <div\s+id="entry-body">\s* + (?P<description>.+?)\s* + </div>\s* + </div> + ''' + m = re.search(DESCRIPTION_REGEX, html) + if m is not None: + return m.group('description') + return self._html_search_meta(u'description', html, u'description') + + def _extract_duration(self, html): + m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) + return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None + + def _extract_slides(self, html): + m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html) + return m.group('slidesurl') if m is not None else None + + def _extract_zip(self, html): + m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html) + return m.group('zipurl') if m is not None else None + + def _extract_avg_rating(self, html): + m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html) + return float(m.group('avgrating')) if m is not None else 0 + + def _extract_rating_count(self, html): + m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html) + return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 + + def _extract_view_count(self, html): + m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html) + return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 + + def _extract_comment_count(self, html): + m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html) + return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 + + def _fix_count(self, count): + return int(str(count).replace(',', '')) if count is not None else None + + def _extract_authors(self, html): + m = re.search(r'(?s)<li 
class="author">(.*?)</li>', html) + if m is None: + return None + return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1)) + + def _extract_session_code(self, html): + m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html) + return m.group('code') if m is not None else None + + def _extract_session_day(self, html): + m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html) + return m.group('day') if m is not None else None + + def _extract_session_room(self, html): + m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html) + return m.group('room') if m is not None else None + + def _extract_session_speakers(self, html): + return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) + + def _extract_content(self, html, content_path): + # Look for downloadable content + formats = self._formats_from_html(html) + slides = self._extract_slides(html) + zip_ = self._extract_zip(html) + + # Nothing to download + if len(formats) == 0 and slides is None and zip_ is None: + self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path) + return + + # Extract meta + title = self._extract_title(html) + description = self._extract_description(html) + thumbnail = self._og_search_thumbnail(html) + duration = self._extract_duration(html) + avg_rating = self._extract_avg_rating(html) + rating_count = self._extract_rating_count(html) + view_count = self._extract_view_count(html) + comment_count = self._extract_comment_count(html) + + common = {'_type': 'video', + 'id': content_path, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'avg_rating': avg_rating, + 'rating_count': rating_count, + 'view_count': view_count, + 'comment_count': comment_count, + } + + result = [] + + if slides is not None: + d = common.copy() + d.update({ 'title': title + '-Slides', 'url': slides }) + result.append(d) + + if zip_ is not None: + d = 
common.copy() + d.update({ 'title': title + '-Zip', 'url': zip_ }) + result.append(d) + + if len(formats) > 0: + d = common.copy() + d.update({ 'title': title, 'formats': formats }) + result.append(d) + + return result + + def _extract_entry_item(self, html, content_path): + contents = self._extract_content(html, content_path) + if contents is None: + return contents + + authors = self._extract_authors(html) + + for content in contents: + content['authors'] = authors + + return contents + + def _extract_session(self, html, content_path): + contents = self._extract_content(html, content_path) + if contents is None: + return contents + + session_meta = {'session_code': self._extract_session_code(html), + 'session_day': self._extract_session_day(html), + 'session_room': self._extract_session_room(html), + 'session_speakers': self._extract_session_speakers(html), + } + + for content in contents: + content.update(session_meta) + + return contents + + def _extract_content_rss(self, rss): + ''' + Extracts links to entry items right out of RSS feed. + This approach is faster than extracting from web pages + one by one, but suffers from some problems. 
+ Pros: + - no need to download additional pages + - provides more media links + - accurate file size + Cons: + - fewer meta data provided + - links to media files have no appropriate data that may be used as format_id + - RSS does not contain links to presentation materials (slides, zip) + ''' + entries = [] + for item in rss.findall('./channel/item'): + url = item.find('./link').text + video_id = url.split('/')[-1] + formats = self._formats_from_rss_item(item) + + if len(formats) == 0: + self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id) + continue + + title = item.find('./title').text + description = item.find('./description').text + + thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text + + duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration') + duration = duration_e.text if duration_e is not None else 0 + + speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator') + speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else [] + + entries.append({'_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'session_speakers': speakers, + }) + return entries + + def _extract_list(self, content_path): + rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') + if self._EXTRACT_ENTRY_ITEMS_FROM_RSS: + return self._extract_content_rss(rss) + else: + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, content_path, title_text) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + content_path = mobj.group('contentpath') + + webpage = self._download_webpage(url, content_path, u'Downloading web page') + + page_type_m = re.search(r'<meta 
name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage) + if page_type_m is None: + raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True) + + page_type = page_type_m.group('pagetype') + if page_type == 'List': # List page, may contain list of 'item'-like objects + return self._extract_list(content_path) + elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content + return self._extract_entry_item(webpage, content_path) + elif page_type == 'Session': # Event session page, may contain downloadable content + return self._extract_session(webpage, content_path) + else: + raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True) \ No newline at end of file From 4d2ebb6bd771edaa26ebb5e76f9ee8e880a4d152 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 12 Dec 2013 15:19:23 +0700 Subject: [PATCH 064/150] [channel9] Cleanup --- youtube_dl/extractor/channel9.py | 94 ++------------------------------ 1 file changed, 5 insertions(+), 89 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 016c4497a..b4b586c12 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,10 +3,7 @@ import re from .common import InfoExtractor -from ..utils import ( - format_bytes, - ExtractorError, -) +from ..utils import ExtractorError class Channel9IE(InfoExtractor): ''' @@ -51,7 +48,6 @@ class Channel9IE(InfoExtractor): ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - _EXTRACT_ENTRY_ITEMS_FROM_RSS = False # Sorted by quality _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] @@ -90,37 +86,6 @@ class Channel9IE(InfoExtractor): formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) return formats - def _formats_from_rss_item(self, item): - - def process_formats(elem): - formats = [] - for media_content in 
elem.findall('./{http://search.yahoo.com/mrss/}content'): - url = media_content.attrib['url'] - # Ignore unrelated media - if url.endswith('.ism/manifest'): - continue - format_note = media_content.attrib['type'] - filesize = int(media_content.attrib['fileSize']) - formats.append({'url': url, - 'format_note': format_note, - 'format': '%s %s' % (format_note, format_bytes(filesize)), - 'filesize': filesize, - }) - return formats - - formats = [] - - for media_group in item.findall('./{http://search.yahoo.com/mrss/}group'): - formats.extend(process_formats(media_group)) - - # Sometimes there are no media:groups in item, but there is media:content - # right in item (usually when there is the only media source) - formats.extend(process_formats(item)) - - # Sort by file size - formats.sort(key=lambda fmt: fmt['filesize']) - return formats - def _extract_title(self, html): title = self._html_search_meta(u'title', html, u'title') if title is None: @@ -274,61 +239,12 @@ class Channel9IE(InfoExtractor): return contents - def _extract_content_rss(self, rss): - ''' - Extracts links to entry items right out of RSS feed. - This approach is faster than extracting from web pages - one by one, but suffers from some problems. 
- Pros: - - no need to download additional pages - - provides more media links - - accurate file size - Cons: - - fewer meta data provided - - links to media files have no appropriate data that may be used as format_id - - RSS does not contain links to presentation materials (slides, zip) - ''' - entries = [] - for item in rss.findall('./channel/item'): - url = item.find('./link').text - video_id = url.split('/')[-1] - formats = self._formats_from_rss_item(item) - - if len(formats) == 0: - self._downloader.report_warning(u'The recording for session %s is not yet available' % video_id) - continue - - title = item.find('./title').text - description = item.find('./description').text - - thumbnail = item.find('./{http://search.yahoo.com/mrss/}thumbnail').text - - duration_e = item.find('./{http://www.itunes.com/dtds/podcast-1.0.dtd}duration') - duration = duration_e.text if duration_e is not None else 0 - - speakers_e = item.find('./{http://purl.org/dc/elements/1.1/}creator') - speakers = speakers_e.text.split(', ') if speakers_e is not None and speakers_e.text else [] - - entries.append({'_type': 'video', - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'session_speakers': speakers, - }) - return entries - def _extract_list(self, content_path): rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS') - if self._EXTRACT_ENTRY_ITEMS_FROM_RSS: - return self._extract_content_rss(rss) - else: - entries = [self.url_result(session_url.text, 'Channel9') - for session_url in rss.findall('./channel/item/link')] - title_text = rss.find('./channel/title').text - return self.playlist_result(entries, content_path, title_text) + entries = [self.url_result(session_url.text, 'Channel9') + for session_url in rss.findall('./channel/item/link')] + title_text = rss.find('./channel/title').text + return self.playlist_result(entries, content_path, title_text) def 
_real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 211f555d4cb0bc142d5b9304a919e27aba237d40 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 12 Dec 2013 15:55:31 +0700 Subject: [PATCH 065/150] [channel9] Missing import in __init__ --- youtube_dl/extractor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1149dc1ec..fb206a742 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .channel9 import Channel9IE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .clipsyndicate import ClipsyndicateIE From 9b17ba0fa512a9882a22bb2407ec433038bc8327 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 12 Dec 2013 16:10:17 +0700 Subject: [PATCH 066/150] [channel9] Fix test description md5 --- youtube_dl/extractor/channel9.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index b4b586c12..ae70ea229 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -39,7 +39,7 @@ class Channel9IE(InfoExtractor): u'md5': u'b43ee4529d111bc37ba7ee4f34813e68', u'info_dict': { u'title': u'Self-service BI with Power BI - nuclear testing', - u'description': u'md5:a6d5cfd9ee46d1851cf6e40ea61cfc10', + u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', u'duration': 1540, u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', u'authors': [ u'Mike Wilmot' ], From 24b173fa5c4abf740cd5700722bc9162f1dc4c3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 12 Dec 2013 13:04:02 +0100 Subject: [PATCH 067/150] [naver] 
Recognize mobile urls (fixes #1951) --- youtube_dl/extractor/naver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { u'url': u'http://tvcast.naver.com/v/81652', From 7a563df90ac93cf206d53f1af82f57a36343d589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 12 Dec 2013 13:05:38 +0100 Subject: [PATCH 068/150] [daum] Recognize mobile urls (#1952) --- youtube_dl/extractor/daum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = u'daum.net' _TEST = { From dadb8184e4374c278f5c66c36b13c08d8474e30f Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Fri, 13 Dec 2013 22:27:37 +0700 Subject: [PATCH 069/150] Fix typo in month name --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0dab9fcc5..d9bf6c24c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1051,7 +1051,7 @@ def month_by_name(name): """ Return the number of a month by (locale-independently) English name """ ENGLISH_NAMES = [ - u'Januar', u'February', u'March', u'April', u'May', u'June', + u'January', u'February', u'March', u'April', u'May', u'June', 
u'July', u'August', u'September', u'October', u'November', u'December'] try: return ENGLISH_NAMES.index(name) + 1 From 5d574e143f41e1b38a25c814e2bf573b19f022da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 13 Dec 2013 17:04:40 +0100 Subject: [PATCH 070/150] [ign] Update one of test video's title --- youtube_dl/extractor/ign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 57b79a336..381af91e4 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -44,7 +44,7 @@ class IGNIE(InfoExtractor): { u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', u'info_dict': { - u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', }, }, From 8e05c870b4650599acfa35050df0ba7143bbe6ad Mon Sep 17 00:00:00 2001 From: Michael Orlitzky <michael@orlitzky.com> Date: Fri, 13 Dec 2013 22:22:53 -0500 Subject: [PATCH 071/150] Add support for pornhd.com. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pornhd.py | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 youtube_dl/extractor/pornhd.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1149dc1ec..bd16f0f3e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -111,6 +111,7 @@ from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .podomatic import PodomaticIE +from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .pyvideo import PyvideoIE diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py new file mode 100644 index 000000000..c56740b8a --- /dev/null +++ b/youtube_dl/extractor/pornhd.py @@ -0,0 +1,42 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urllib_parse + +class PornHdIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _TEST = { + u'id': u'1962', + u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + u'md5': u'4fe06e5108e8b524c35896f4c54c7155', + u'info_dict': { + u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", + u"age_limit": 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + video_title = mobj.group('video_title') + video_extension = 'flv' + + webpage = self._download_webpage(url, video_id) + + + self.report_extraction(video_id) + + video_url = self._html_search_regex( + r'&hd=(http.+?)&', webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) + + age_limit = 18 + + return { + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + 'age_limit': age_limit, + } From 9ee859b683cd19bcdc01c2b6271ad7380a402ad2 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 14 Dec 2013 14:20:12 +0100 Subject: [PATCH 072/150] [daylimotion] Add support for urls from the mobile site (fixes #1953) It uses the 'touch' subdomain and adds a '#' before 'video' --- youtube_dl/extractor/dailymotion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index aea7e557e..6685c94a3 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = u'dailymotion' _FORMATS = [ @@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('_')[0].split('?')[0] + video_id = mobj.group('id') url = 'http://www.dailymotion.com/video/%s' % video_id From dca02c80bc5c370bb3b15c9343d255dd124d2a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 15 Dec 2013 11:42:38 +0100 Subject: [PATCH 073/150] Fix detection of the extension if the 'extractaudio' is given and improve the error message (#1969) Using 'foo.mp4' shouldn't raise an error. 
If 'foo' is given suggest using 'foo.%(ext)s' for the template --- youtube_dl/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3e82cd637..55e59814b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -604,10 +604,10 @@ def _real_main(argv=None): or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') or u'%(title)s-%(id)s.%(ext)s') - if '%(ext)s' not in outtmpl and opts.extractaudio: + if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error(u'Cannot download a video and extract audio into the same' - u' file! Use "%%(ext)s" instead of %r' % - determine_ext(outtmpl, u'')) + u' file! Use "{0}.%(ext)s" instead of "{0}" as the output' + u' template'.format(outtmpl)) ydl_opts = { 'usenetrc': opts.usenetrc, From 5fe18bdbde749267c35bb360a3a0699e555a3fcf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 03:09:49 +0100 Subject: [PATCH 074/150] Add --min-views / --max-views (Fixes #1979) --- youtube_dl/YoutubeDL.py | 27 ++++++++++++++++++++++----- youtube_dl/__init__.py | 10 ++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c77777ba0..ab68d013c 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -127,7 +127,16 @@ class YoutubeDL(object): noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. - download_archive: File name of a file where all downloads are recorded. + min_views: An integer representing the minimum view count the video + must have in order to not be skipped. + Videos without view count information are always + downloaded. None for no limit. + max_views: An integer representing the maximum view count. 
+ Videos that are more popular than that are not + downloaded. + Videos without view count information are always + downloaded. None for no limit. + download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. @@ -415,13 +424,14 @@ class YoutubeDL(object): def _match_entry(self, info_dict): """ Returns None iff the file should be downloaded """ + video_title = info_dict.get('title', info_dict.get('id', u'video')) if 'title' in info_dict: # This can happen when we're just evaluating the playlist title = info_dict['title'] matchtitle = self.params.get('matchtitle', False) if matchtitle: if not re.search(matchtitle, title, re.IGNORECASE): - return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + return u'"' + title + '" title did not match pattern "' + matchtitle + '"' rejecttitle = self.params.get('rejecttitle', False) if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): @@ -430,14 +440,21 @@ class YoutubeDL(object): if date is not None: dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: - return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + view_count = info_dict.get('view_count', None) + if view_count is not None: + min_views = self.params.get('min_views') + if min_views is not None and view_count < min_views: + return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) + max_views = self.params.get('max_views') + if max_views is not None and view_count > max_views: + return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) age_limit = self.params.get('age_limit') if age_limit is not None: if 
age_limit < info_dict.get('age_limit', 0): return u'Skipping "' + title + '" because it is age restricted' if self.in_download_archive(info_dict): - return (u'%s has already been recorded in archive' - % info_dict.get('title', info_dict.get('id', u'video'))) + return u'%s has already been recorded in archive' % video_title return None @staticmethod diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 55e59814b..437c37541 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -210,6 +210,14 @@ def parseOpts(overrideArguments=None): selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) + selection.add_option( + '--min-views', metavar='COUNT', dest='min_views', + default=None, type=int, + help="Do not download any videos with less than COUNT views",) + selection.add_option( + '--max-views', metavar='COUNT', dest='max_views', + default=None, type=int, + help="Do not download any videos with more than COUNT views",) selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False) selection.add_option('--age-limit', metavar='YEARS', dest='age_limit', help='download only videos suitable for the given age', @@ -668,6 +676,8 @@ def _real_main(argv=None): 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, 'daterange': date, 'cachedir': opts.cachedir, 'youtube_print_sig_code': opts.youtube_print_sig_code, From f8b56e95b82d9cb783c5a789b6b2770efff57b73 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> 
Date: Mon, 16 Dec 2013 03:34:46 +0100 Subject: [PATCH 075/150] [theplatform] Detect geoblocked content --- youtube_dl/extractor/theplatform.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 61452e47d..650d7517f 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -3,6 +3,7 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, xpath_with_ns, ) @@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor): smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) meta = self._download_xml(smil_url, video_id) + + try: + error_msg = next( + n.attrib['abstract'] + for n in meta.findall(_x('.//smil:ref')) + if n.attrib['title'] == u'Geographic Restriction') + except StopIteration: + pass + else: + raise ExtractorError(error_msg, expected=True) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) From 48462108f3b1937bcaa19a4752246748a5fc9360 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 03:43:45 +0100 Subject: [PATCH 076/150] [theplatform] Fix geographic restriction check --- youtube_dl/extractor/theplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 650d7517f..cec65261b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -38,7 +38,7 @@ class ThePlatformIE(InfoExtractor): error_msg = next( n.attrib['abstract'] for n in meta.findall(_x('.//smil:ref')) - if n.attrib['title'] == u'Geographic Restriction') + if n.attrib.get('title') == u'Geographic Restriction') except StopIteration: pass else: From fa3ae234e04f631a0754b6b474417ddffc2faee2 Mon Sep 17 
00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 03:53:43 +0100 Subject: [PATCH 077/150] [cbs] Add extractor (Fixes #1977) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cbs.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/cbs.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1149dc1ec..37bf8e306 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cbs import CBSIE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .clipsyndicate import ClipsyndicateIE diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py new file mode 100644 index 000000000..ac0315853 --- /dev/null +++ b/youtube_dl/extractor/cbs.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor + + +class CBSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*' + + _TEST = { + u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + u'file': u'4JUVEwq3wUT7.flv', + u'info_dict': { + u'title': u'Connect Chat feat. Garth Brooks', + u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + u'duration': 1495, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + real_id = self._search_regex( + r"video\.settings\.pid\s*=\s*'([^']+)';", + webpage, u'real video ID') + return self.url_result(u'theplatform:%s' % real_id) From b466b7029d0af24dc06ecd8bc1f147a39be91c62 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 04:09:05 +0100 Subject: [PATCH 078/150] [youtube] Make duration an integer or None --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 874429b78..a68a214ca 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1377,9 +1377,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'length_seconds' not in video_info: self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' + video_duration = None else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None From c0ba0f485914330d5b4d7b175fb1da3724d4bda0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 04:09:30 +0100 Subject: [PATCH 079/150] Document duration field --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 69a083b68..fe8ce9e6c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,8 +34,8 @@ class InfoExtractor(object): The 
dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. + url: Final video URL. ext: Video filename extension. Instead of url and ext, formats can also specified. @@ -54,6 +54,7 @@ class InfoExtractor(object): player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video From 525ef9227f178a965a2010971256a1929af8cdd3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 04:15:10 +0100 Subject: [PATCH 080/150] Add --get-duration (Fixes #859) --- youtube_dl/YoutubeDL.py | 4 ++++ youtube_dl/__init__.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ab68d013c..52bd8e0e3 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -34,6 +34,7 @@ from .utils import ( encodeFilename, ExtractorError, format_bytes, + formatSeconds, get_term_width, locked_file, make_HTTPS_handler, @@ -94,6 +95,7 @@ class YoutubeDL(object): forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. forcefilename: Force printing final filename. + forceduration: Force printing duration. forcejson: Force printing info_dict as JSON. simulate: Do not download the video files. format: Video format code. 
@@ -765,6 +767,8 @@ class YoutubeDL(object): self.to_stdout(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: self.to_stdout(filename) + if self.params.get('forceduration', False) and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) if self.params.get('forceformat', False): self.to_stdout(info_dict['format']) if self.params.get('forcejson', False): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 437c37541..d5c0a3643 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -298,6 +298,9 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--get-description', action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False) + verbosity.add_option('--get-duration', + action='store_true', dest='getduration', + help='simulate, quiet but print video length', default=False) verbosity.add_option('--get-filename', action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False) @@ -617,22 +620,25 @@ def _real_main(argv=None): u' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' u' template'.format(outtmpl)) + any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson + ydl_opts = { 'usenetrc': opts.usenetrc, 'username': opts.username, 'password': opts.password, 'videopassword': opts.videopassword, - 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), + 'quiet': (opts.quiet or any_printing), 'forceurl': opts.geturl, 'forcetitle': opts.gettitle, 'forceid': opts.getid, 'forcethumbnail': opts.getthumbnail, 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, 'forcefilename': opts.getfilename, 'forceformat': opts.getformat, 'forcejson': opts.dumpjson, 'simulate': opts.simulate, - 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), + 'skip_download': (opts.skip_download or opts.simulate or any_printing), 'format': opts.format, 'format_limit': opts.format_limit, 'listformats': opts.listformats, From 7b6fefc9d4df92dca11ea536bdaef633f7a4229c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 04:39:04 +0100 Subject: [PATCH 081/150] Apply --no-overwrites for --write-* files as well (Fixes #1980) --- youtube_dl/YoutubeDL.py | 114 ++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 52bd8e0e3..2a4ab674d 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -366,22 +366,6 @@ class YoutubeDL(object): error_message = u'%s %s' % (_msg_header, message) self.trouble(error_message, tb) - def report_writedescription(self, descfn): - """ Report that 
the description file is being written """ - self.to_screen(u'[info] Writing video description to: ' + descfn) - - def report_writesubtitles(self, sub_filename): - """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) - - def report_writeinfojson(self, infofn): - """ Report that the metadata file has been written """ - self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) - - def report_writeannotations(self, annofn): - """ Report that the annotations file has been written. """ - self.to_screen(u'[info] Writing video annotations to: ' + annofn) - def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -791,28 +775,34 @@ class YoutubeDL(object): return if self.params.get('writedescription', False): - try: - descfn = filename + u'.description' - self.report_writedescription(descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(info_dict['description']) - except (KeyError, TypeError): - self.report_warning(u'There\'s no description to write.') - except (OSError, IOError): - self.report_error(u'Cannot write description file ' + descfn) - return + descfn = filename + u'.description' + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): + self.to_screen(u'[info] Video description is already present') + else: + try: + self.to_screen(u'[info] Writing video description to: ' + descfn) + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(info_dict['description']) + except (KeyError, TypeError): + self.report_warning(u'There\'s no description to write.') + except (OSError, IOError): + self.report_error(u'Cannot write description file ' + descfn) + return if self.params.get('writeannotations', False): - try: - annofn = filename + u'.annotations.xml' - self.report_writeannotations(annofn) - with 
io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: - annofile.write(info_dict['annotations']) - except (KeyError, TypeError): - self.report_warning(u'There are no annotations to write.') - except (OSError, IOError): - self.report_error(u'Cannot write annotations file: ' + annofn) - return + annofn = filename + u'.annotations.xml' + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): + self.to_screen(u'[info] Video annotations are already present') + else: + try: + self.to_screen(u'[info] Writing video annotations to: ' + annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -828,38 +818,48 @@ class YoutubeDL(object): continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) - self.report_writesubtitles(sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + else: + self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) except (OSError, IOError): self.report_error(u'Cannot write subtitles file ' + descfn) return if self.params.get('writeinfojson', False): infofn = os.path.splitext(filename)[0] + u'.info.json' - self.report_writeinfojson(infofn) - try: - json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) - 
write_json_file(json_info_dict, encodeFilename(infofn)) - except (OSError, IOError): - self.report_error(u'Cannot write metadata to JSON file ' + infofn) - return + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + self.to_screen(u'[info] Video description metadata is already present') + else: + self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn) + try: + json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) + write_json_file(json_info_dict, encodeFilename(infofn)) + except (OSError, IOError): + self.report_error(u'Cannot write metadata to JSON file ' + infofn) + return if self.params.get('writethumbnail', False): if info_dict.get('thumbnail') is not None: thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format - self.to_screen(u'[%s] %s: Downloading thumbnail ...' % - (info_dict['extractor'], info_dict['id'])) - try: - uf = compat_urllib_request.urlopen(info_dict['thumbnail']) - with open(thumb_filename, 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % - (info_dict['extractor'], info_dict['id'], thumb_filename)) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_warning(u'Unable to download thumbnail "%s": %s' % - (info_dict['thumbnail'], compat_str(err))) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + self.to_screen(u'[%s] %s: Thumbnail is already present' % + (info_dict['extractor'], info_dict['id'])) + else: + self.to_screen(u'[%s] %s: Downloading thumbnail ...' 
% + (info_dict['extractor'], info_dict['id'])) + try: + uf = compat_urllib_request.urlopen(info_dict['thumbnail']) + with open(thumb_filename, 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % + (info_dict['extractor'], info_dict['id'], thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning(u'Unable to download thumbnail "%s": %s' % + (info_dict['thumbnail'], compat_str(err))) if not self.params.get('skip_download', False): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): From 8863d0de91ee0b7f7bac28fe9b6ba27f986e656c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 04:45:32 +0100 Subject: [PATCH 082/150] release 2013.12.16 --- README.md | 5 +++++ youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 68b2e1ae7..caed94846 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ which means you can modify it, redistribute it or use it however you like. --date DATE download only videos uploaded in this date --datebefore DATE download only videos uploaded before this date --dateafter DATE download only videos uploaded after this date + --min-views COUNT Do not download any videos with less than COUNT + views + --max-views COUNT Do not download any videos with more than COUNT + views --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age --download-archive FILE Download only videos not listed in the archive @@ -127,6 +131,7 @@ which means you can modify it, redistribute it or use it however you like. 
--get-id simulate, quiet but print id --get-thumbnail simulate, quiet but print thumbnail URL --get-description simulate, quiet but print video description + --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format -j, --dump-json simulate, quiet but print JSON information diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b9a52fcfa..5bc7fd774 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.11.2' +__version__ = '2013.12.16' From e3946f989ea53c14fb7ae0afd29f80a74512a3a3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 05:04:12 +0100 Subject: [PATCH 083/150] Set process title to youtube-dl This allows killing all youtube-dl processes with killall youtube-dl, and shows up nicer in some programs. --- youtube_dl/__init__.py | 4 ++++ youtube_dl/utils.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index d5c0a3643..09a99f0db 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -62,6 +62,7 @@ from .utils import ( MaxDownloadsReached, preferredencoding, SameFileError, + setproctitle, std_headers, write_string, ) @@ -471,12 +472,15 @@ def parseOpts(overrideArguments=None): return parser, opts, args + def _real_main(argv=None): # Compatibility fixes for Windows if sys.platform == 'win32': # https://github.com/rg3/youtube-dl/issues/820 codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + setproctitle(u'youtube-dl') + parser, opts, args = parseOpts(argv) # Set user agent diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4e8a84a56..bd46a2da2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import ctypes import datetime import email.utils import errno @@ -1062,3 
+1063,17 @@ def month_by_name(name): def fix_xml_all_ampersand(xml_str): """Replace all the '&' by '&amp;' in XML""" return xml_str.replace(u'&', u'&amp;') + + +def setproctitle(title): + try: + libc = ctypes.cdll.LoadLibrary("libc.so.6") + except OSError: + return + title = title + buf = ctypes.create_string_buffer(len(title) + 1) + buf.value = title + try: + libc.prctl(15, ctypes.byref(buf), 0, 0, 0) + except AttributeError: + return # Strange libc, just skip this From 6f5dcd4eee10a291b89b70a68e1a955cd94d341f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 05:10:42 +0100 Subject: [PATCH 084/150] [pornhd] Simplify --- youtube_dl/extractor/pornhd.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index c56740b8a..71abd5013 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -3,12 +3,13 @@ import re from .common import InfoExtractor from ..utils import compat_urllib_parse + class PornHdIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' _TEST = { - u'id': u'1962', u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', - u'md5': u'4fe06e5108e8b524c35896f4c54c7155', + u'file': u'1962.flv', + u'md5': u'35272469887dca97abd30abecc6cdf75', u'info_dict': { u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", u"age_limit": 18, @@ -20,23 +21,18 @@ class PornHdIE(InfoExtractor): video_id = mobj.group('video_id') video_title = mobj.group('video_title') - video_extension = 'flv' webpage = self._download_webpage(url, video_id) - - self.report_extraction(video_id) - video_url = self._html_search_regex( r'&hd=(http.+?)&', webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - age_limit = 18 return { - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 
'title': video_title, + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, 'age_limit': age_limit, } From b83be81d27e252eed4585fcce14205266b51b1d5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 05:11:19 +0100 Subject: [PATCH 085/150] Credit @mjorlitzky for pornhd (#1961) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 09a99f0db..0775b72fd 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -37,6 +37,7 @@ __authors__ = ( 'Anton Larionov', 'Takuya Tsuchida', 'Sergey M.', + 'Michael Orlitzky', ) __license__ = 'Public Domain' From 09dacfa57fa92643e181f039bd7fdf5513921d0f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 05:44:34 +0100 Subject: [PATCH 086/150] [mdr] Simplify --- youtube_dl/extractor/mdr.py | 93 +++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 8b096a78a..366352dbe 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -5,35 +5,28 @@ from ..utils import ( ExtractorError, ) + class MDRIE(InfoExtractor): _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*' - _TITLE = r'<h2>(?P<title1>[^<]+)<span>(?P<title2>[^<]+)</span></h2>' - _MEDIA_XML = r'(?P<xmlurl>/mediathek/(.+)/(video|audio)([0-9]+)-avCustom.xml)' - _MEDIA_STREAM_VIDEO = r'<asset>.*<frameWidth>(?P<frameWidth>[0-9]+)</frameWidth>.*<flashMediaServerApplicationURL>(?P<flashMediaServerApplicationURL>[^<]+)</flashMediaServerApplicationURL><flashMediaServerURL>(?P<flashMediaServerURL>[^<]+)</flashMediaServerURL>.*<progressiveDownloadUrl>(?P<progressiveDownloadUrl>[^<]+)</progressiveDownloadUrl></asset>' - _MEDIA_STREAM_AUDIO = 
r'<asset>.*<mediaType>(?P<mediaType>[A-Z0-9]+)</mediaType><bitrateAudio>(?P<bitrateAudio>[0-9]+)</bitrateAudio>.*<flashMediaServerApplicationURL>(?P<flashMediaServerApplicationURL>[^<]+)</flashMediaServerApplicationURL><flashMediaServerURL>(?P<flashMediaServerURL>[^<]+)</flashMediaServerURL>.*<progressiveDownloadUrl>(?P<progressiveDownloadUrl>[^<]+)</progressiveDownloadUrl></asset>' _TESTS = [{ u'url': u'http://www.mdr.de/mediathek/themen/nachrichten/video165624_zc-c5c7de76_zs-3795826d.html', u'file': u'165624.mp4', - u'md5': u'95165945756198b8fa2dea10f0b04614', + u'md5': u'ae785f36ecbf2f19b42edf1bc9c85815', u'info_dict': { u"title": u"MDR aktuell Eins30 09.12.2013, 22:48 Uhr" }, - #u'skip': u'Requires rtmpdump' # rtmp is optional }, { - u'url': u' http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html', - u'file': u'718370.mp4', - u'md5': u'4a5b1fbb5519fb0d929c384b6ff7cb8b', + u'url': u'http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html', + u'file': u'718370.mp3', + u'md5': u'a9d21345a234c7b45dee612f290fd8d7', u'info_dict': { u"title": u"MDR 1 RADIO SACHSEN 10.12.2013, 05:00 Uhr" }, - #u'skip': u'Requires rtmpdump' # rtmp is optional }] def _real_extract(self, url): - - # determine video id from url m = re.match(self._VALID_URL, url) video_id = m.group('video_id') domain = m.group('domain') @@ -41,38 +34,46 @@ class MDRIE(InfoExtractor): # determine title and media streams from webpage html = self._download_webpage(url, video_id) - t = re.search(self._TITLE, html) - if not t: - raise ExtractorError(u'no title found') - title = t.group('title1') + t.group('title2') - m = re.search(self._MEDIA_XML, html) - if not m: - raise ExtractorError(u'no xml found') - xmlurl = m.group('xmlurl') - xml = self._download_webpage(domain+xmlurl, video_id, 'download XML').replace('\n','').replace('\r','').replace('<asset>','\n<asset>').replace('</asset>','</asset>\n') - if(mediatype == "video"): - 
streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM_VIDEO, xml)] - if not streams: - raise ExtractorError(u'no media found') - # choose default media type and highest quality for now - stream = max([s for s in streams if s["progressiveDownloadUrl"].startswith("http://") ], - key=lambda s: int(s["frameWidth"])) - else: - streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM_AUDIO, xml)] - if not streams: - raise ExtractorError(u'no media found') - # choose default media type (MP4) and highest quality for now - stream = max([s for s in streams if s["progressiveDownloadUrl"].startswith("http://") and s["mediaType"] == "MP4" ], - key=lambda s: int(s["bitrateAudio"])) - # there's two possibilities: RTMP stream or HTTP download - info = {'id': video_id, 'title': title, 'ext': 'mp4'} - if not stream["progressiveDownloadUrl"]: - self.to_screen(u'RTMP download detected') - assert stream['flashMediaServerURL'].startswith('mp4:') - info["url"] = stream["flashMediaServerApplicationURL"] - info["play_path"] = stream['flashMediaServerURL'] - else: - assert stream["progressiveDownloadUrl"].endswith('.mp4') - info["url"] = stream["progressiveDownloadUrl"] - return [info] + title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title') + xmlurl = self._search_regex( + r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL') + + doc = self._download_xml(domain + xmlurl, video_id) + formats = [] + for a in doc.findall('./assets/asset'): + url_el = a.find('.//progressiveDownloadUrl') + if url_el is None: + continue + abr = int(a.find('bitrateAudio').text) // 1000 + media_type = a.find('mediaType').text + format = { + 'abr': abr, + 'filesize': int(a.find('fileSize').text), + 'url': url_el.text, + } + + vbr_el = a.find('bitrateVideo') + if vbr_el is None: + format.update({ + 'vcodec': 'none', + 'format_id': u'%s-%d' % (media_type, abr), + }) + else: + vbr = int(vbr_el.text) // 1000 + format.update({ + 'vbr': vbr, + 'width': 
int(a.find('frameWidth').text), + 'height': int(a.find('frameHeight').text), + 'format_id': u'%s-%d' % (media_type, vbr), + }) + formats.append(format) + formats.sort(key=lambda f: (f.get('vbr'), f['abr'])) + if not formats: + raise ValueError('Could not find any valid formats') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } From 6888a874a1dcdc916bafa5aa55864b60514f5fa4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 05:45:15 +0100 Subject: [PATCH 087/150] release 2013.12.16.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5bc7fd774..15ac46d17 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16' +__version__ = '2013.12.16.1' From 8c5f0c9fbc09b361517cd5045d856fec96381696 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 08:16:11 +0100 Subject: [PATCH 088/150] [mdr] Clean up --- youtube_dl/extractor/mdr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 366352dbe..d29cf2c07 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -30,7 +30,6 @@ class MDRIE(InfoExtractor): m = re.match(self._VALID_URL, url) video_id = m.group('video_id') domain = m.group('domain') - mediatype = m.group('type') # determine title and media streams from webpage html = self._download_webpage(url, video_id) @@ -70,7 +69,7 @@ class MDRIE(InfoExtractor): formats.append(format) formats.sort(key=lambda f: (f.get('vbr'), f['abr'])) if not formats: - raise ValueError('Could not find any valid formats') + raise ExtractorError(u'Could not find any valid formats') return { 'id': video_id, From d66152a8982261f930c2a324c1fc3ba3b3324134 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 08:16:38 
+0100 Subject: [PATCH 089/150] [ndtv] Remove unused imports --- youtube_dl/extractor/ndtv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 2e8501f99..d81df3c10 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,6 +1,4 @@ -import json import re -import time from .common import InfoExtractor from ..utils import month_by_name From a19fd00cc4f799215a942c92fd5c722b3ea499cd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 13:16:20 +0100 Subject: [PATCH 090/150] Simplify --playlist-start / --playlist-end interface --- youtube_dl/YoutubeDL.py | 12 ++++++------ youtube_dl/__init__.py | 28 ++++++++++++---------------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a4ab674d..2fa34ebc9 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -557,16 +557,16 @@ class YoutubeDL(object): n_all_entries = len(ie_result['entries']) playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend', -1) - + playlistend = self.params.get('playlistend', None) + # For backwards compatibility, interpret -1 as whole list if playlistend == -1: - entries = ie_result['entries'][playliststart:] - else: - entries = ie_result['entries'][playliststart:playlistend] + playlistend = None + entries = ie_result['entries'][playliststart:playlistend] n_entries = len(entries) - self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % + self.to_screen( + u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % (ie_result['extractor'], playlist, n_all_entries, n_entries)) for i, entry in enumerate(entries, 1): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0775b72fd..6df44020b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -198,10 +198,14 @@ def 
parseOpts(overrideArguments=None): help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH') - selection.add_option('--playlist-start', - dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is %default)', default=1) - selection.add_option('--playlist-end', - dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) + selection.add_option( + '--playlist-start', + dest='playliststart', metavar='NUMBER', default=1, type=int, + help='playlist video to start at (default is %default)') + selection.add_option( + '--playlist-end', + dest='playlistend', metavar='NUMBER', default=None, type=int, + help='playlist video to end at (default is last)') selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') selection.add_option('--max-downloads', metavar='NUMBER', @@ -576,18 +580,10 @@ def _real_main(argv=None): if numeric_buffersize is None: parser.error(u'invalid buffer size specified') opts.buffersize = numeric_buffersize - try: - opts.playliststart = int(opts.playliststart) - if opts.playliststart <= 0: - raise ValueError(u'Playlist start must be positive') - except (TypeError, ValueError): - parser.error(u'invalid playlist start number specified') - try: - opts.playlistend = int(opts.playlistend) - if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): - raise ValueError(u'Playlist end must be greater than playlist start') - except (TypeError, ValueError): - parser.error(u'invalid playlist end number specified') + if opts.playliststart <= 0: + raise ValueError(u'Playlist start must be positive') + if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: + raise 
ValueError(u'Playlist end must be greater than playlist start') if opts.extractaudio: if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: parser.error(u'invalid audio format specified') From d7dda1688886284b17c35be200d980f24c2546d2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 13:56:13 +0100 Subject: [PATCH 091/150] [blinkx] Add extractor (Fixes #1972) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/blinkx.py | 86 ++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 6 +++ 3 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/blinkx.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cebb8717f..b8ff750d0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -13,6 +13,7 @@ from .arte import ( from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE +from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .breakcom import BreakIE diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py new file mode 100644 index 000000000..48f16b692 --- /dev/null +++ b/youtube_dl/extractor/blinkx.py @@ -0,0 +1,86 @@ +import datetime +import json +import re + +from .common import InfoExtractor +from ..utils import ( + remove_start, +) + + +class BlinkxIE(InfoExtractor): + _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/ce/|blinkx:)(?P<id>[^?]+)' + _IE_NAME = u'blinkx' + + _TEST = { + u'url': u'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB', + u'file': u'8aQUy7GV.mp4', + u'md5': u'2e9a07364af40163a908edbf10bb2492', + u'info_dict': { + u"title": u"Police Car Rolls Away", + u"uploader": u"stupidvideos.com", + u"upload_date": u"20131215", + u"description": u"A police car gently rolls away from a 
fight. Maybe it felt weird being around a confrontation and just had to get out of there!", + u"duration": 14.886, + u"thumbnails": [{ + "width": 100, + "height": 76, + "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", + }], + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + display_id = video_id[:8] + + api_url = (u'https://apib4.blinkx.com/api.php?action=play_video&' + + u'video=%s' % video_id) + data_json = self._download_webpage(api_url, display_id) + data = json.loads(data_json)['api']['results'][0] + dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) + upload_date = dt.strftime('%Y%m%d') + + duration = None + thumbnails = [] + formats = [] + for m in data['media']: + if m['type'] == 'jpg': + thumbnails.append({ + 'url': m['link'], + 'width': int(m['w']), + 'height': int(m['h']), + }) + elif m['type'] == 'original': + duration = m['d'] + elif m['type'] in ('flv', 'mp4'): + vcodec = remove_start(m['vcodec'], 'ff') + acodec = remove_start(m['acodec'], 'ff') + format_id = (u'%s-%sk-%s' % + (vcodec, + (int(m['vbr']) + int(m['abr'])) // 1000, + m['w'])) + formats.append({ + 'format_id': format_id, + 'url': m['link'], + 'vcodec': vcodec, + 'acodec': acodec, + 'abr': int(m['abr']) // 1000, + 'vbr': int(m['vbr']) // 1000, + 'width': int(m['w']), + 'height': int(m['h']), + }) + formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr'])) + + return { + 'id': display_id, + 'fullid': video_id, + 'title': data['title'], + 'formats': formats, + 'uploader': data['channel_name'], + 'upload_date': upload_date, + 'description': data.get('description'), + 'thumbnails': thumbnails, + 'duration': duration, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bd46a2da2..f3ad47422 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1077,3 +1077,9 @@ def setproctitle(title): libc.prctl(15, ctypes.byref(buf), 0, 0, 0) except AttributeError: return # 
Strange libc, just skip this + + +def remove_start(s, start): + if s.startswith(start): + return s[len(start):] + return s From d67b0b15964c3f335e47a291b274e07129ac199d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 14:13:40 +0100 Subject: [PATCH 092/150] Reorder info_dict documentation --- youtube_dl/extractor/common.py | 52 ++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fe8ce9e6c..11b6ed524 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -35,14 +35,38 @@ class InfoExtractor(object): id: Video identifier. title: Video title, unescaped. + + Additionally, it must contain either a formats entry or url and ext: + + formats: A list of dictionaries for each format available, it must + be ordered from worst to best quality. Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19") + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * vbr Average video bitrate in KBit/s + * vcodec Name of the video codec in use + * filesize The number of bytes, if known in advance + * player_url SWF Player URL (used for rtmpdump). url: Final video URL. ext: Video filename extension. - - Instead of url and ext, formats can also specified. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). 
+ urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib.request.urlopen The following fields are optional: - format: The video format, defaults to ext (used for --get-format) thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. @@ -51,7 +75,6 @@ class InfoExtractor(object): upload_date: Video upload date (YYYYMMDD). uploader_id: Nickname or id of the video uploader. location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. duration: Length of the video in seconds, as an integer. @@ -59,28 +82,7 @@ class InfoExtractor(object): like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen age_limit: Age restriction for the video, as an integer (years) - formats: A list of dictionaries for each format available, it must - be ordered from worst to best quality. Potential fields: - * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing - * format A human-readable description of the format - ("mp4 container with h264/opus"). - Calculated from the format_id, width, height. - and format_note fields if missing. 
- * format_id A short description of the format - ("mp4_h264_opus" or "19") - * format_note Additional info about the format - ("3D" or "DASH video") - * width Width of the video, if known - * height Height of the video, if known - * abr Average audio bitrate in KBit/s - * acodec Name of the audio codec in use - * vbr Average video bitrate in KBit/s - * vcodec Name of the video codec in use - * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) From 00902cd60139c919871aa30c43d0ae21c130fb86 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 14:13:51 +0100 Subject: [PATCH 093/150] release 2013.12.16.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 15ac46d17..76b86c976 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.1' +__version__ = '2013.12.16.2' From 780603027f8e57b0e4ec9ea1bf74b6a1afa24cb4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 14:42:07 +0100 Subject: [PATCH 094/150] [videopremium] Skip test --- youtube_dl/extractor/videopremium.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py index acae81448..65463c733 100644 --- a/youtube_dl/extractor/videopremium.py +++ b/youtube_dl/extractor/videopremium.py @@ -15,6 +15,7 @@ class VideoPremiumIE(InfoExtractor): u'params': { u'skip_download': True, }, + u'skip': u'Test file has been deleted.', } def _real_extract(self, url): From e64eaaa97dd00b15ff0ebde17d6d6e99e6a7394e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 14:44:17 +0100 Subject: [PATCH 095/150] Fix execution under Python 3 --- 
youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f3ad47422..dbfac0f43 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1066,13 +1066,14 @@ def fix_xml_all_ampersand(xml_str): def setproctitle(title): + assert isinstance(title, type(u'')) try: libc = ctypes.cdll.LoadLibrary("libc.so.6") except OSError: return title = title buf = ctypes.create_string_buffer(len(title) + 1) - buf.value = title + buf.value = title.encode('utf-8') try: libc.prctl(15, ctypes.byref(buf), 0, 0, 0) except AttributeError: From 4b2da48ea794178236975eaeef1be120b8a95f2a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 14:44:29 +0100 Subject: [PATCH 096/150] release 2013.12.16.3 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 76b86c976..76691fb46 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.2' +__version__ = '2013.12.16.3' From 0a9ce268bad2d2bfaba1498e3843784a2f14e856 Mon Sep 17 00:00:00 2001 From: alimirjamali <ali.mirjamali@gmail.com> Date: Mon, 16 Dec 2013 20:14:28 +0330 Subject: [PATCH 097/150] Incorrect variable is used to check whether thumbnail exists Dear @phihag I believe in line 848, the correct variable to check is 'thumb_filename' rather than 'infofn' Kindly advise Mit freundlichen Gruessen Ali --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2fa34ebc9..b1f87415b 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -845,7 +845,7 @@ class YoutubeDL(object): if info_dict.get('thumbnail') is not None: thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') thumb_filename = os.path.splitext(filename)[0] + u'.' 
+ thumb_format - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen(u'[%s] %s: Thumbnail is already present' % (info_dict['extractor'], info_dict['id'])) else: From ee3e63e477c12591bbfcb6f13382397139900d44 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Mon, 16 Dec 2013 20:08:23 +0100 Subject: [PATCH 098/150] [GenericIE] Add support for embedded blip.tv --- youtube_dl/extractor/generic.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 216e03218..a7ea9f733 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -222,6 +222,18 @@ class GenericIE(InfoExtractor): 'id': video_id, } + # Look for embedded blip.tv player + mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage) + if mobj: + return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV') + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage) + if mobj: + player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1) + player_page = self._download_webpage(player_url, mobj.group(1)) + blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False) + if blip_video_id: + return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV') + # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: From 8c8e3eec793d04cc4dc884a65c119902d1ec7793 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:10:06 +0100 Subject: [PATCH 099/150] [facebook] Recognize #! 
URLs (Fixes #1988) --- test/test_all_urls.py | 6 +++++- youtube_dl/extractor/facebook.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index e9458b2e3..bd77b7c30 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import get_testcases from youtube_dl.extractor import ( + FacebookIE, gen_extractors, JustinTVIE, YoutubeIE, @@ -87,12 +88,15 @@ class TestAllURLsMatching(unittest.TestCase): assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + def test_facebook_matching(self): + self.assertTrue(FacebookIE.suitable(u'https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) + def test_no_duplicates(self): ies = gen_extractors() for tc in get_testcases(): url = tc['url'] for ie in ies: - if type(ie).__name__ in ['GenericIE', tc['name'] + 'IE']: + if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'): self.assertTrue(ie.suitable(url), '%s should match URL %r' % (type(ie).__name__, url)) else: self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url)) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3b210710e..4556079c8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,7 +17,7 @@ from ..utils import ( class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 
'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): u'file': u'120708114770723.mp4', u'md5': u'48975a41ccc4b7a581abd68651c1a5a8', u'info_dict': { - u"duration": 279, + u"duration": 279, u"title": u"PEOPLE ARE AWESOME 2013" } } From 24050dd11c0fe46136344cdcccedfef0d0b260c3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:10:18 +0100 Subject: [PATCH 100/150] release 2013.12.16.4 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 76691fb46..6c1fb95dc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.3' +__version__ = '2013.12.16.4' From 0e2a436dcebc7dc17c5848bada4adcad4248491d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:34:41 +0100 Subject: [PATCH 101/150] [radiofrance] Add support (Fixes #1942) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/radiofrance.py | 60 +++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/radiofrance.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b8ff750d0..2761b5439 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -119,6 +119,7 @@ from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .pyvideo import PyvideoIE +from .radiofrance import RadioFranceIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py new file mode 100644 index 000000000..bb33b50bc --- /dev/null +++ b/youtube_dl/extractor/radiofrance.py @@ -0,0 +1,60 @@ +# coding: utf-8 +import datetime +import 
json +import re + +from .common import InfoExtractor +from ..utils import ( + remove_start, +) + + +class RadioFranceIE(InfoExtractor): + _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + IE_NAME = u'radiofrance' + + _TEST = { + u'url': u'http://maison.radiofrance.fr/radiovisions/one-one', + u'file': u'one-one.mp4', + u'md5': u'todo', + u'info_dict': { + u"title": u"One to one", + u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + u"uploader": u"ferdi", + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title') + description = self._html_search_regex( + r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', + webpage, u'description', fatal=False) + uploader = self._html_search_regex( + r'<div class="credit">  © (.*?)</div>', + webpage, u'uploader', fatal=False) + + formats_str = self._html_search_regex( + r'class="jp-jplayer[^"]*" data-source="([^"]+)">', + webpage, u'audio URLs') + formats = [ + { + 'format_id': m[0], + 'url': m[1], + 'vcodec': 'none', + } + for m in + re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str) + ] + # No sorting, we don't know any more about these formats + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + } From 8fe56478f879a21e24695a4e27ba8546d0822f7b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:34:47 +0100 Subject: [PATCH 102/150] release 2013.12.16.5 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py 
b/youtube_dl/version.py index 6c1fb95dc..2ce0bc9d1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.4' +__version__ = '2013.12.16.5' From f7a68925724eda913c96519f2a5644aebb3688c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 16 Dec 2013 21:42:41 +0100 Subject: [PATCH 103/150] [arte:ddc] Remove test video seems to expire in 7 days, as arte+7 --- youtube_dl/extractor/arte.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 4b7bef775..9254fbfe0 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -266,20 +266,6 @@ class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = u'arte.tv:ddc' _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)' - _TEST = { - u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien', - u'file': u'049881-009_PLUS7-D.flv', - u'info_dict': { - u'title': u'Mit offenen Karten', - u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6', - u'upload_date': u'20131207', - }, - u'params': { - # rtmp download - u'skip_download': True, - }, - } - def _real_extract(self, url): video_id, lang = self._extract_url_info(url) if lang == 'folge': From f25571ffbf67c14bb91798e445a2690e03481937 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:45:21 +0100 Subject: [PATCH 104/150] Add support for embedded vevo player (Fixes #1957) --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/vevo.py | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a7ea9f733..fca23b6c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -241,6 +241,12 @@ class GenericIE(InfoExtractor): # Don't set the extractor because it can be a track url or an 
album return self.url_result(burl) + # Look for embedded Vevo player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4823992ef..7eda9469b 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,11 @@ class VevoIE(InfoExtractor): Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE) """ - _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)' + _VALID_URL = r'''(?x) + (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| + https?://cache\.vevo\.com/m/html/embed\.html\?video=| + vevo:) + (?P<id>[^&?#]+)''' _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', From 83c632dc439cbf475e34f788d81e6308359c5c52 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:46:16 +0100 Subject: [PATCH 105/150] release 2013.12.16.6 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2ce0bc9d1..712b6c4e1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.5' +__version__ = '2013.12.16.6' From ebce53b3d85bc5c099b4fd0ca27d03034d7024ea Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 21:48:38 +0100 Subject: [PATCH 106/150] [vevo] Add suppor for videoplayer. 
URLs (#1957) --- youtube_dl/extractor/vevo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 7eda9469b..a4b26a26f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -18,6 +18,7 @@ class VevoIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| + https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) (?P<id>[^&?#]+)''' _TESTS = [{ From 87a28127d225f698c3cbf77a7f943338a3499d6e Mon Sep 17 00:00:00 2001 From: Itay Brandes <Brandes.Itay@gmail.com> Date: Sat, 14 Dec 2013 17:40:51 +0200 Subject: [PATCH 107/150] _search_regex's "isatty" call fails with Py2exe's _search_regex calls the sys.stderr.isatty() function for unix systems. Py2exe uses a custom Stderr() stream which doesn't have an `isatty()` function, leading to it's crash. Fixes easily with checking that it's a unix system first. --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 11b6ed524..1fc0624a3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -295,7 +295,7 @@ class InfoExtractor(object): mobj = re.search(p, string, flags) if mobj: break - if sys.stderr.isatty() and os.name != 'nt': + if os.name != 'nt' and sys.stderr.isatty(): _name = u'\033[0;34m%s\033[0m' % name else: _name = name From 5c541b2cb79113952efbdb72be72db568c8132a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 16 Dec 2013 22:05:28 +0100 Subject: [PATCH 108/150] [mtv] Add support for urls from the mobile site (fixes #1959) --- youtube_dl/extractor/mtv.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py 
index 5b2bd9633..ed11f521a 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -93,7 +93,9 @@ class MTVServicesInfoExtractor(InfoExtractor): class MTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + _VALID_URL = r'''(?x)^https?:// + (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$| + m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))''' _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' @@ -127,16 +129,17 @@ class MTVIE(MTVServicesInfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - - webpage = self._download_webpage(url, video_id) - - # Some videos come from Vevo.com - m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";', - webpage, re.DOTALL) - if m_vevo: - vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - - uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = mobj.group('mgid') + if uri is None: + webpage = self._download_webpage(url, video_id) + + # Some videos come from Vevo.com + m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";', + webpage, re.DOTALL) + if m_vevo: + vevo_id = m_vevo.group(1); + self.to_screen(u'Vevo video detected: %s' % vevo_id) + return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') return self._get_videos_info(uri) From d90df974c3164ea377f2ce2b04742e6ff21379e8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 22:18:27 +0100 Subject: [PATCH 109/150] [academicearth] Add support for courses (#1976) --- test/test_playlists.py | 12 +++++++++ youtube_dl/extractor/__init__.py | 3 ++- youtube_dl/extractor/academicearth.py | 36 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 
youtube_dl/extractor/academicearth.py diff --git a/test/test_playlists.py b/test/test_playlists.py index 87ca401e5..b7c6850fd 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -12,6 +12,7 @@ from test.helper import FakeYDL from youtube_dl.extractor import ( + AcademicEarthCourseIE, DailymotionPlaylistIE, DailymotionUserIE, VimeoChannelIE, @@ -158,5 +159,16 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Inspector') self.assertTrue(len(result['entries']) >= 9) + def test_AcademicEarthCourse(self): + dl = FakeYDL() + ie = AcademicEarthCourseIE(dl) + result = ie.extract(u'http://academicearth.org/courses/building-dynamic-websites/') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'building-dynamic-websites') + self.assertEqual(result['title'], u'Building Dynamic Websites') + self.assertEqual(result['description'], "Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. 
The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") + self.assertEqual(len(result['entries']), 10) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 2761b5439..7f2f8806e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,7 @@ -from .appletrailers import AppleTrailersIE +from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .anitube import AnitubeIE +from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE from .arte import ( diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py new file mode 100644 index 000000000..5045e7332 --- /dev/null +++ b/youtube_dl/extractor/academicearth.py @@ -0,0 +1,36 @@ +import datetime +import json +import re + +from .common import InfoExtractor +from ..utils import ( + remove_start, +) + + +class AcademicEarthCourseIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/courses/(?P<id>[^?#/]+)' + IE_NAME = u'AcademicEarth:Course' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + playlist_id = m.group('id') + + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_regex( + r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title') + description = self._html_search_regex( + r'<p class="excerpt">(.*?)</p>', + webpage, u'description', fatal=False) + urls = re.findall( + r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">', + webpage) + entries = [self.url_result(u) for u in urls] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'description': description, + 'entries': entries, + } From 
11b68f6e1be60a6faa61fc7df3460bb58ed95d70 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 22:18:58 +0100 Subject: [PATCH 110/150] release 2013.12.16.7 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 712b6c4e1..f5bcc67ce 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.6' +__version__ = '2013.12.16.7' From d6756d375807f84e86daca7168f12da03f7572f4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 22:25:02 +0100 Subject: [PATCH 111/150] [playlist-test] require a string --- test/test_playlists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index b7c6850fd..5004d0464 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -166,7 +166,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], u'building-dynamic-websites') self.assertEqual(result['title'], u'Building Dynamic Websites') - self.assertEqual(result['description'], "Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. 
The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") + self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") self.assertEqual(len(result['entries']), 10) From 8b4e27461075ec246906487f000b2424e0603a93 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 22:28:52 +0100 Subject: [PATCH 112/150] [rtlnow] Fix URL calculation (Closes #1989) --- youtube_dl/extractor/rtlnow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 511674d8d..ccf0b1546 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -10,7 +10,7 @@ from ..utils import ( class RTLnowIE(InfoExtractor): """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = 
r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' + _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)' _TESTS = [{ u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', u'file': u'90419.flv', @@ -82,7 +82,7 @@ class RTLnowIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) webpage_url = u'http://' + mobj.group('url') - video_page_url = u'http://' + mobj.group('base_url') + video_page_url = u'http://' + mobj.group('domain') + u'/' video_id = mobj.group(u'video_id') webpage = self._download_webpage(webpage_url, video_id) From fa77b742ac54c2125e7dca931f533cd3e945b8f7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 16 Dec 2013 23:07:57 +0100 Subject: [PATCH 113/150] [radiofrance] Fill in test details --- youtube_dl/extractor/radiofrance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index bb33b50bc..2866115ef 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -15,12 +15,12 @@ class RadioFranceIE(InfoExtractor): _TEST = { u'url': u'http://maison.radiofrance.fr/radiovisions/one-one', - u'file': u'one-one.mp4', - u'md5': u'todo', + u'file': u'one-one.ogg', + u'md5': u'bdbb28ace95ed0e04faab32ba3160daf', u'info_dict': { u"title": u"One to one", u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient 
ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", - u"uploader": u"ferdi", + u"uploader": u"Thomas Hercouët", }, } From ec98946ef9ae19f5218ac66d72d61883709982ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 02:41:34 +0100 Subject: [PATCH 114/150] [academicearth] Support playlists (Closes #1976) --- youtube_dl/extractor/academicearth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 5045e7332..c450c30ca 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -9,7 +9,7 @@ from ..utils import ( class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/courses/(?P<id>[^?#/]+)' + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)' IE_NAME = u'AcademicEarth:Course' def _real_extract(self, url): From 46374a56b214cae9f66ef3c01cf3d62a71544030 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 02:49:54 +0100 Subject: [PATCH 115/150] [youtube] Do not warn for videos with allow_rating=0 This fixes #1982 Test video: http://www.youtube.com/watch?v=gi2uH3YxohU --- youtube_dl/extractor/common.py | 7 ++++--- youtube_dl/extractor/xtube.py | 2 +- youtube_dl/extractor/youtube.py | 4 +++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1fc0624a3..939249d7b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..utils import ( sanitize_filename, unescapeHTML, ) +_NO_DEFAULT = object() class InfoExtractor(object): @@ -281,7 +282,7 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, string, name, 
default=None, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -303,7 +304,7 @@ class InfoExtractor(object): if mobj: # return the first matching group return next(g for g in mobj.groups() if g is not None) - elif default is not None: + elif default is not _NO_DEFAULT: return default elif fatal: raise RegexNotFoundError(u'Unable to extract %s' % _name) @@ -312,7 +313,7 @@ class InfoExtractor(object): u'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): """ Like _search_regex, but strips HTML tags and unescapes entities. """ diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index e3458d2bd..1a6a7688d 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -32,7 +32,7 @@ class XTubeIE(InfoExtractor): video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title') video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False) - video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None) + video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False) video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a68a214ca..c860eedda 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py @@ -1361,7 +1361,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_description = u'' def _extract_count(klass): - count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False) + count = self._search_regex( + r'class="%s">([\d,]+)</span>' % re.escape(klass), + video_webpage, klass, default=None) if count is not None: return int(count.replace(',', '')) return None From 44c471c3b873473157adb8ba8a55667ab54b2602 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 02:51:22 +0100 Subject: [PATCH 116/150] release 2013.12.17 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f5bcc67ce..9ccfa6a81 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.16.7' +__version__ = '2013.12.17' From 29eb5174031cfc0b5de556da3da7761ac377de4e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 04:13:36 +0100 Subject: [PATCH 117/150] Add webpage_url_basename info_dict field (Fixes #1938) --- test/test_utils.py | 25 ++++++++++++++++--------- youtube_dl/YoutubeDL.py | 4 ++++ youtube_dl/utils.py | 7 +++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0fa66beec..5f4fdb771 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -13,20 +13,21 @@ import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform from youtube_dl.utils import ( - timeconvert, - sanitize_filename, - unescapeHTML, - orderedSet, DateRange, - unified_strdate, + encodeFilename, find_xpath_attr, get_meta_content, - xpath_with_ns, - smuggle_url, - unsmuggle_url, + orderedSet, + sanitize_filename, shell_quote, - encodeFilename, + smuggle_url, str_to_int, + timeconvert, + unescapeHTML, + unified_strdate, + 
unsmuggle_url, + url_basename, + xpath_with_ns, ) if sys.version_info < (3, 0): @@ -181,6 +182,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) + def test_url_basename(self): + self.assertEqual(url_basename(u'http://foo.de/'), u'') + self.assertEqual(url_basename(u'http://foo.de/bar/baz'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz?x=y'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz#x=y'), u'baz') + self.assertEqual(url_basename(u'http://foo.de/bar/baz/'), u'baz') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f87415b..2a078adfb 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -47,6 +47,7 @@ from .utils import ( subtitles_filename, takewhile_inclusive, UnavailableVideoError, + url_basename, write_json_file, write_string, YoutubeDLHandler, @@ -484,6 +485,7 @@ class YoutubeDL(object): { 'extractor': ie.IE_NAME, 'webpage_url': url, + 'webpage_url_basename': url_basename(url), 'extractor_key': ie.ie_key(), }) if process: @@ -576,6 +578,7 @@ class YoutubeDL(object): 'playlist_index': i + playliststart, 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], } @@ -596,6 +599,7 @@ class YoutubeDL(object): { 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], }) return r diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dbfac0f43..a249c7ec1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1084,3 +1084,10 @@ def remove_start(s, start): if s.startswith(start): return s[len(start):] return s + + +def url_basename(url): + m = 
re.match(r'(?:https?:|)//[^/]+/(?:[^/?#]+/)?([^/?#]+)/?(?:[?#]|$)', url) + if not m: + return u'' + return m.group(1) From f09828b4e175997adce92f6ccecdcc30bb8060b6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 04:13:41 +0100 Subject: [PATCH 118/150] release 2013.12.17.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9ccfa6a81..910c5a34f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.17' +__version__ = '2013.12.17.1' From cecaaf3f58ad9f544dbb79af1e565d9353fa2b2d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:04:33 +0100 Subject: [PATCH 119/150] [generic] Do not use compatibility result fallback --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fca23b6c4..da933067a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -148,7 +148,7 @@ class GenericIE(InfoExtractor): try: new_url = self._test_redirect(url) if new_url: - return [self.url_result(new_url)] + return self.url_result(new_url) except compat_urllib_error.HTTPError: # This may be a stupid server that doesn't like HEAD, our UA, or so pass From d6c7a367e88096bb17e323954002c084477fe908 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:32:58 +0100 Subject: [PATCH 120/150] [utils] Fix url_basename --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5f4fdb771..e5778cd83 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -188,6 +188,9 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_basename(u'http://foo.de/bar/baz?x=y'), u'baz') 
self.assertEqual(url_basename(u'http://foo.de/bar/baz#x=y'), u'baz') self.assertEqual(url_basename(u'http://foo.de/bar/baz/'), u'baz') + self.assertEqual( + url_basename(u'http://media.w3.org/2010/05/sintel/trailer.mp4'), + u'trailer.mp4') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a249c7ec1..2d12e2df9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1087,7 +1087,7 @@ def remove_start(s, start): def url_basename(url): - m = re.match(r'(?:https?:|)//[^/]+/(?:[^/?#]+/)?([^/?#]+)/?(?:[?#]|$)', url) + m = re.match(r'(?:https?:|)//[^/]+/(?:[^?#]+/)?([^/?#]+)/?(?:[?#]|$)', url) if not m: return u'' return m.group(1) From 42393ce234c651aaae244e1546e1803101765acc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:33:55 +0100 Subject: [PATCH 121/150] Add support for direct links to a video (#1973) --- youtube_dl/extractor/generic.py | 52 ++++++++++++++++++++++++++------- youtube_dl/utils.py | 5 ++++ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index da933067a..209f68204 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -13,6 +13,8 @@ from ..utils import ( ExtractorError, smuggle_url, unescapeHTML, + unified_strdate, + url_basename, ) from .brightcove import BrightcoveIE @@ -71,6 +73,17 @@ class GenericIE(InfoExtractor): u'skip_download': True, }, }, + # Direct link to a video + { + u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4', + u'file': u'trailer.mp4', + u'md5': u'67d406c2bcb6af27fa886f31aa934bbe', + u'info_dict': { + u'id': u'trailer', + u'title': u'trailer', + u'upload_date': u'20100513', + } + } ] def report_download_webpage(self, video_id): @@ -83,7 +96,7 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - def 
_test_redirect(self, url): + def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" class HeadRequest(compat_urllib_request.Request): def get_method(self): @@ -131,29 +144,46 @@ class GenericIE(InfoExtractor): response = opener.open(HeadRequest(url)) if response is None: raise ExtractorError(u'Invalid URL protocol') - new_url = response.geturl() - - if url == new_url: - return False - - self.report_following_redirect(new_url) - return new_url + return response def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) + video_id = os.path.splitext(url.split('/')[-1])[0] try: - new_url = self._test_redirect(url) - if new_url: + response = self._send_head(url) + + # Check for redirect + new_url = response.geturl() + if url != new_url: + self.report_following_redirect(new_url) return self.url_result(new_url) + + # Check for direct link to a video + content_type = response.headers.get('Content-Type', '') + m = re.match(r'^(?:audio|video)/(?P<format_id>.+)$', content_type) + if m: + upload_date = response.headers.get('Last-Modified') + if upload_date: + upload_date = unified_strdate(upload_date) + assert (url_basename(url) == 'trailer.mp4') + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + }], + 'upload_date': upload_date, + } + except compat_urllib_error.HTTPError: # This may be a stupid server that doesn't like HEAD, our UA, or so pass - video_id = url.split('/')[-1] try: webpage = self._download_webpage(url, video_id) except ValueError: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2d12e2df9..d5069dcca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -761,12 +761,17 @@ def unified_strdate(date_str): 
'%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except: pass + if upload_date is None: + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') return upload_date def determine_ext(url, default_ext=u'unknown_video'): From 946135aa2ab846a367cde5b3b86a0224ac696fdd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:34:30 +0100 Subject: [PATCH 122/150] [academicearth] remove unused imports --- youtube_dl/extractor/academicearth.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index c450c30ca..ac05f8246 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -1,11 +1,6 @@ -import datetime -import json import re from .common import InfoExtractor -from ..utils import ( - remove_start, -) class AcademicEarthCourseIE(InfoExtractor): From 7de6e075b4b8738fed185d9924375100ae0d8425 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:35:16 +0100 Subject: [PATCH 123/150] [radiofrance] remove unused imports --- youtube_dl/extractor/radiofrance.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 2866115ef..34652f6c1 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -1,12 +1,7 @@ # coding: utf-8 -import datetime -import json import re from .common import InfoExtractor -from ..utils import ( - remove_start, -) class RadioFranceIE(InfoExtractor): @@ -42,11 +37,11 @@ class RadioFranceIE(InfoExtractor): webpage, u'audio URLs') formats = [ { - 'format_id': m[0], - 'url': m[1], + 
'format_id': fm[0], + 'url': fm[1], 'vcodec': 'none', } - for m in + for fm in re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str) ] # No sorting, we don't know any more about these formats From 6086d121cbc3e49c30e3ef64151cc2c4b22ed713 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 12:35:57 +0100 Subject: [PATCH 124/150] release 2013.12.17.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 910c5a34f..7cbee7335 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.17.1' +__version__ = '2013.12.17.2' From 9b8aaeed856f189fef016563a10f7f4e46a1590e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 17 Dec 2013 14:56:29 +0100 Subject: [PATCH 125/150] Simplify url_basename Use urlparse from the standard library. --- youtube_dl/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5069dcca..4c7ad89c0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1092,7 +1092,5 @@ def remove_start(s, start): def url_basename(url): - m = re.match(r'(?:https?:|)//[^/]+/(?:[^?#]+/)?([^/?#]+)/?(?:[?#]|$)', url) - if not m: - return u'' - return m.group(1) + path = compat_urlparse.urlparse(url).path + return path.strip(u'/').split(u'/')[-1] From e029b8bd43a30211e1ed71ee8a123c9b081d4c22 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 16:12:14 +0100 Subject: [PATCH 126/150] [utils] Remove duplicated line This line was added by accident in 42393ce234c651aaae244e1546e1803101765acc --- youtube_dl/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4c7ad89c0..cc391bddd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -761,7 
+761,6 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: try: From 3e78514568ddb516d23de052f694c8de7ab469d7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 17 Dec 2013 16:26:32 +0100 Subject: [PATCH 127/150] [generic] Support application/ogg for direct links Also remove some debugging code. --- youtube_dl/extractor/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 209f68204..fd32370c2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -164,18 +164,18 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = response.headers.get('Content-Type', '') - m = re.match(r'^(?:audio|video)/(?P<format_id>.+)$', content_type) + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) if m: upload_date = response.headers.get('Last-Modified') if upload_date: upload_date = unified_strdate(upload_date) - assert (url_basename(url) == 'trailer.mp4') return { 'id': video_id, 'title': os.path.splitext(url_basename(url))[0], 'formats': [{ 'format_id': m.group('format_id'), 'url': url, + 'vcodec': u'none' if m.group('type') == 'audio' else None }], 'upload_date': upload_date, } From 77aa6b329da84335ac3f183e0ed04198b318a990 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 05:28:16 +0700 Subject: [PATCH 128/150] [ivi] Add support for ivi.ru --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/ivi.py | 154 +++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 youtube_dl/extractor/ivi.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7f2f8806e..677a894b3 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -82,6 +82,10 @@ from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE from .internetvideoarchive import InternetVideoArchiveIE +from .ivi import ( + IviIE, + IviCompilationIE +) from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py new file mode 100644 index 000000000..aa8b3b8a3 --- /dev/null +++ b/youtube_dl/extractor/ivi.py @@ -0,0 +1,154 @@ +# encoding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + ExtractorError, +) + + +class IviIE(InfoExtractor): + IE_DESC = u'ivi.ru' + IE_NAME = u'ivi' + _VALID_URL = r'^https?://(?:www\.)?(?P<url>ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+))' + + _TESTS = [ + # Single movie + { + u'url': u'http://www.ivi.ru/watch/53141', + u'file': u'53141.mp4', + u'md5': u'6ff5be2254e796ed346251d117196cf4', + u'info_dict': { + u'title': u'Иван Васильевич меняет профессию', + u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346', + u'duration': 5498, + u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', + }, + }, + # Serial's serie + { + u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791', + u'file': u'74791.mp4', + u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9', + u'info_dict': { + u'title': u'Дежурный ангел - 1 серия', + u'duration': 2490, + u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', + }, + } + ] + + # Sorted by quality + _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + + # Sorted by size + _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480'] + + def _extract_description(self, html): + m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html) + 
return m.group('description') if m is not None else None + + def _extract_comment_count(self, html): + m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html) + return int(m.group('commentcount')) if m is not None else 0 + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + + api_url = 'http://api.digitalaccess.ru/api/json/' + + data = {u'method': u'da.content.get', + u'params': [video_id, {u'site': u's183', + u'referrer': u'http://www.ivi.ru/watch/%s' % video_id, + u'contentid': video_id + } + ] + } + + request = compat_urllib_request.Request(api_url, json.dumps(data)) + + video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON') + video_json = json.loads(video_json_page) + + if u'error' in video_json: + error = video_json[u'error'] + if error[u'origin'] == u'NoRedisValidData': + raise ExtractorError(u'Video %s does not exist' % video_id, expected=True) + raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True) + + result = video_json[u'result'] + + formats = [{'url': x[u'url'], + 'format_id': x[u'content_format'] + } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) + + if len(formats) == 0: + self._downloader.report_warning(u'No media links available for %s' % video_id) + return + + duration = result[u'duration'] + compilation = result[u'compilation'] + title = result[u'title'] + + title = '%s - %s' % (compilation, title) if compilation is not None else title + + previews = result[u'preview'] + previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) + thumbnail = previews[-1][u'url'] if len(previews) > 0 else None + + video_page_url = 'http://' + mobj.group('url') + video_page = self._download_webpage(video_page_url, 
video_id, u'Downloading video page') + + description = self._extract_description(video_page) + comment_count = self._extract_comment_count(video_page) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + 'comment_count': comment_count, + 'formats': formats, + } + + +class IviCompilationIE(InfoExtractor): + IE_DESC = u'ivi.ru compilations' + IE_NAME = u'ivi:compilation' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$' + + def _extract_entries(self, html, compilation_id): + return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi') + for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + compilation_id = mobj.group('compilationid') + season_id = mobj.group('seasonid') + + if season_id is not None: # Season link + season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id) + playlist_id = '%s/season%s' % (compilation_id, season_id) + playlist_title = self._html_search_meta(u'title', season_page, u'title') + entries = self._extract_entries(season_page, compilation_id) + else: # Compilation link + compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page') + playlist_id = compilation_id + playlist_title = self._html_search_meta(u'title', compilation_page, u'title') + seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page) + if len(seasons) == 0: # No seasons in this compilation + entries = self._extract_entries(compilation_page, compilation_id) + else: + entries = [] + for season_id in seasons: + season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), + compilation_id, u'Downloading season 
%s web page' % season_id) + entries.extend(self._extract_entries(season_page, compilation_id)) + + return self.playlist_result(entries, playlist_id, playlist_title) \ No newline at end of file From 8c21b7c647d5328388cb5be6af6cbe9f6143485c Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 05:39:22 +0700 Subject: [PATCH 129/150] [ivi] Add playlist tests --- test/test_playlists.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/test_playlists.py b/test/test_playlists.py index 5004d0464..576f7fb4e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -6,6 +6,7 @@ import os import sys import unittest +from youtube_dl.extractor.ivi import IviCompilationIE sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL @@ -168,6 +169,24 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Building Dynamic Websites') self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. 
The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") self.assertEqual(len(result['entries']), 10) + + def test_ivi_compilation(self): + dl = FakeYDL() + ie = IviCompilationIE(dl) + result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'dezhurnyi_angel') + self.assertEqual(result['title'], u'Дежурный ангел (2010 - 2012)') + self.assertTrue(len(result['entries']) >= 36) + + def test_ivi_compilation_season(self): + dl = FakeYDL() + ie = IviCompilationIE(dl) + result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'dezhurnyi_angel/season2') + self.assertEqual(result['title'], u'Дежурный ангел (2010 - 2012) 2 сезон') + self.assertTrue(len(result['entries']) >= 20) if __name__ == '__main__': From 5ce54a8205df1eea7e38ed86d3540f601f300a7e Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 05:53:34 +0700 Subject: [PATCH 130/150] [ivi] Neat import --- test/test_playlists.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 576f7fb4e..1b7b4e3d8 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -6,7 +6,6 @@ import os import sys import unittest -from youtube_dl.extractor.ivi import IviCompilationIE sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL @@ -28,7 +27,8 @@ from youtube_dl.extractor import ( BambuserChannelIE, BandcampAlbumIE, SmotriCommunityIE, - SmotriUserIE + SmotriUserIE, + IviCompilationIE ) From 0cc83dc54b619376800f9983cc1d7338a6b414ec Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 05:56:48 +0700 Subject: 
[PATCH 131/150] [smotri] Fix duration field name --- youtube_dl/extractor/smotri.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 4ea89bf85..a589a893b 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -202,7 +202,7 @@ class SmotriIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'video_duration': video_duration, + 'duration': video_duration, 'view_count': video_view_count, 'age_limit': 18 if adult_content else 0, 'video_page_url': video_page_url From 6c6db72ed4505520d80002e37b523e4177146979 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 06:19:41 +0700 Subject: [PATCH 132/150] [ivi] Skip tests for travis build --- youtube_dl/extractor/ivi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index aa8b3b8a3..10279478b 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -27,6 +27,7 @@ class IviIE(InfoExtractor): u'duration': 5498, u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg', }, + u'skip': u'Only works from Russia', }, # Serial's serie { @@ -38,6 +39,7 @@ class IviIE(InfoExtractor): u'duration': 2490, u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg', }, + u'skip': u'Only works from Russia', } ] From a51e37af6242a3fa49ad258a63d3f1a40c0ef9f2 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Thu, 19 Dec 2013 10:53:38 +0700 Subject: [PATCH 133/150] [ivi] Simplify --- youtube_dl/extractor/ivi.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 10279478b..4bdf55f93 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -13,7 +13,7 
@@ from ..utils import ( class IviIE(InfoExtractor): IE_DESC = u'ivi.ru' IE_NAME = u'ivi' - _VALID_URL = r'^https?://(?:www\.)?(?P<url>ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+))' + _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)' _TESTS = [ # Single movie @@ -103,9 +103,7 @@ class IviIE(InfoExtractor): previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format'])) thumbnail = previews[-1][u'url'] if len(previews) > 0 else None - video_page_url = 'http://' + mobj.group('url') - video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') - + video_page = self._download_webpage(url, video_id, u'Downloading video page') description = self._extract_description(video_page) comment_count = self._extract_comment_count(video_page) From 71507a11c899ceba571c0e2d0b2d8e632bcd0c08 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 19 Dec 2013 16:39:01 +0100 Subject: [PATCH 134/150] [soundcloud] Support mobile URLs (Fixes #2009) --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cbba4094b..e22ff9c38 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -24,7 +24,7 @@ class SoundcloudIE(InfoExtractor): """ _VALID_URL = r'''^(?:https?://)? - (?:(?:(?:www\.)?soundcloud\.com/ + (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ (?!sets/)(?P<title>[\w\d-]+)/? 
(?P<token>[^?]+?)?(?:[?].*)?$) From 97e302a419d559a9837eeefb46ad05066540cc66 Mon Sep 17 00:00:00 2001 From: dst <dstftw@gmail.com> Date: Fri, 20 Dec 2013 00:21:04 +0700 Subject: [PATCH 135/150] [imdb] Add support for mobile site URLs --- youtube_dl/extractor/imdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 6fb373db2..e5332cce8 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -11,7 +11,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = u'imdb' IE_DESC = u'Internet Movie Database trailers' - _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { u'url': u'http://www.imdb.com/video/imdb/vi2524815897', @@ -27,7 +27,7 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url,video_id) + webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) descr = get_element_by_attribute('itemprop', 'description', webpage) available_formats = re.findall( r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, From c0d0b01f0e12ce23f7a751ef05e52dabd3e4c1e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 19 Dec 2013 20:28:52 +0100 Subject: [PATCH 136/150] [generic] Detect ooyala videos (fixes #2013) --- youtube_dl/extractor/common.py | 6 ++++-- youtube_dl/extractor/generic.py | 18 +++++++++++++++++- youtube_dl/extractor/ooyala.py | 5 +++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 939249d7b..f89df57ed 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -263,7 +263,8 @@ class InfoExtractor(object): 
self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None, video_id=None): + @staticmethod + def url_result(url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', @@ -272,7 +273,8 @@ class InfoExtractor(object): if video_id is not None: video_info['id'] = video_id return video_info - def playlist_result(self, entries, playlist_id=None, playlist_title=None): + @staticmethod + def playlist_result(entries, playlist_id=None, playlist_title=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fd32370c2..3c56daa02 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -17,6 +17,7 @@ from ..utils import ( url_basename, ) from .brightcove import BrightcoveIE +from .ooyala import OoyalaIE class GenericIE(InfoExtractor): @@ -83,7 +84,17 @@ class GenericIE(InfoExtractor): u'title': u'trailer', u'upload_date': u'20100513', } - } + }, + # ooyala video + { + u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c', + u'info_dict': { + u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', + u'ext': u'mp4', + u'title': u'2cc213299525360.mov', #that's what we get + }, + }, ] def report_download_webpage(self, video_id): @@ -277,6 +288,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for Ooyala videos + mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage) + if mobj is not None: + return OoyalaIE._build_url_result(mobj.group(1)) + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git 
a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 1f7b4d2e7..d08e47734 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -22,6 +22,11 @@ class OoyalaIE(InfoExtractor): def _url_for_embed_code(embed_code): return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + @classmethod + def _build_url_result(cls, embed_code): + return cls.url_result(cls._url_for_embed_code(embed_code), + ie=cls.ie_key()) + def _extract_result(self, info, more_info): return {'id': info['embedCode'], 'ext': 'mp4', From 1f9da9049b5b6dcede3d274acd58aa1b6dea5d2c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 19 Dec 2013 20:44:30 +0100 Subject: [PATCH 137/150] [generic] Support YouTube swf embed (Fixes #2010) --- youtube_dl/extractor/generic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3c56daa02..75cb96eb7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -233,8 +233,11 @@ class GenericIE(InfoExtractor): return self.url_result(surl, 'Vimeo') # Look for embedded YouTube player - matches = re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage) + matches = re.findall(r'''(?x) + (?:<iframe[^>]+?src=|embedSWF\(\s*) + (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/ + (?:embed|v)/.+?) 
+ \1''', webpage) if matches: urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') for tuppl in matches] From 768df745385a283f4df3a38ee4734feec518ec87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 19 Dec 2013 21:02:25 +0100 Subject: [PATCH 138/150] [blinkxx] Add support for youtube videos --- youtube_dl/extractor/blinkx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 48f16b692..e118f2e9f 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -54,6 +54,10 @@ class BlinkxIE(InfoExtractor): }) elif m['type'] == 'original': duration = m['d'] + elif m['type'] == 'youtube': + yt_id = m['link'] + self.to_screen(u'Youtube video detected: %s' % yt_id) + return self.url_result(yt_id, 'Youtube', video_id=yt_id) elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') From aa94a6d3159af8333b56d16f3ed0bc3a164a882a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:05:28 +0100 Subject: [PATCH 139/150] [aparat] Add support (Fixes #2012) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/aparat.py | 56 ++++++++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/generic.py | 15 +++++---- youtube_dl/utils.py | 5 +++ 5 files changed, 73 insertions(+), 6 deletions(-) create mode 100644 youtube_dl/extractor/aparat.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7f2f8806e..7de9d594a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,7 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .anitube import AnitubeIE +from .aparat import AparatIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE 
from .ard import ARDIE diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py new file mode 100644 index 000000000..7e93bc4df --- /dev/null +++ b/youtube_dl/extractor/aparat.py @@ -0,0 +1,56 @@ +#coding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + HEADRequest, +) + + +class AparatIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + + _TEST = { + u'url': u'http://www.aparat.com/v/wP8On', + u'file': u'wP8On.mp4', + u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1', + u'info_dict': { + u"title": u"تیم گلکسی 11 - زومیت", + }, + #u'skip': u'Extremely unreliable', + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' + + video_id + u'/vt/frame') + webpage = self._download_webpage(embed_url, video_id) + + video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage) + for i, video_url in enumerate(video_urls): + req = HEADRequest(video_url) + res = self._request_webpage( + req, video_id, note=u'Testing video URL %d' % i, errnote=False) + if res: + break + else: + raise ExtractorError(u'No working video URLs found') + + title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title') + thumbnail = self._search_regex( + r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f89df57ed..ba46a7bc7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -170,6 +170,8 @@ class InfoExtractor(object): try: return 
self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is False: + return False if errnote is None: errnote = u'Unable to download webpage' errmsg = u'%s: %s' % (errnote, compat_str(err)) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 75cb96eb7..bdb4f58d6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,6 +11,7 @@ from ..utils import ( compat_urlparse, ExtractorError, + HEADRequest, smuggle_url, unescapeHTML, unified_strdate, @@ -109,21 +110,18 @@ class GenericIE(InfoExtractor): def _send_head(self, url): """Check if it is a redirect, like url shorteners, in case return the new url.""" - class HeadRequest(compat_urllib_request.Request): - def get_method(self): - return "HEAD" class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ Subclass the HTTPRedirectHandler to make it use our - HeadRequest also on the redirected URL + HEADRequest also on the redirected URL """ def redirect_request(self, req, fp, code, msg, headers, newurl): if code in (301, 302, 303, 307): newurl = newurl.replace(' ', '%20') newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, + return HEADRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(), unverifiable=True) @@ -152,7 +150,7 @@ class GenericIE(InfoExtractor): compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) - response = opener.open(HeadRequest(url)) + response = opener.open(HEADRequest(url)) if response is None: raise ExtractorError(u'Invalid URL protocol') return response @@ -296,6 +294,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return OoyalaIE._build_url_result(mobj.group(1)) + # Look for Aparat videos + mobj = re.search(r'<iframe 
src="(http://www.aparat.com/video/[^"]+)"', webpage) + if mobj is not None: + return self.url_result(mobj.group(1), 'Aparat') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cc391bddd..2e48f187e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1093,3 +1093,8 @@ def remove_start(s, start): def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip(u'/').split(u'/')[-1] + + +class HEADRequest(compat_urllib_request.Request): + def get_method(self): + return "HEAD" From f65c1d2be020b8078a6a5e166bf4fd17af981387 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:08:16 +0100 Subject: [PATCH 140/150] release 2013.12.20 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7cbee7335..c13af8abd 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.17.2' +__version__ = '2013.12.20' From 79fed2a4dfb9be350671632b8e0364cbed9dc83c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:20:39 +0100 Subject: [PATCH 141/150] [crunchyroll] Fix test (#1721) --- youtube_dl/extractor/crunchyroll.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4bd366079..2b66bddbb 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -24,7 +24,7 @@ class CrunchyrollIE(InfoExtractor): u'file': u'645513.flv', #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412', u'info_dict': { - u'title': u'Wanna be the Strongest in the World – Episode 1 – An Idol-Wrestler is Born!', + u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is 
Born!', u'description': u'md5:2d17137920c64f2f49981a7797d275ef', u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', u'uploader': u'Yomiuri Telecasting Corporation (YTV)', @@ -37,9 +37,9 @@ class CrunchyrollIE(InfoExtractor): }] _FORMAT_IDS = { - u'360': (u'60', u'106'), - u'480': (u'61', u'106'), - u'720': (u'62', u'106'), + u'360': (u'60', u'106'), + u'480': (u'61', u'106'), + u'720': (u'62', u'106'), u'1080': (u'80', u'108'), } @@ -102,7 +102,7 @@ class CrunchyrollIE(InfoExtractor): raise ExtractorError(note_m) video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL) - video_title = re.sub(r' {5} *–? *', u' – ', video_title) + video_title = re.sub(r' {2,}', u' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'') if not video_description: video_description = None From bd1488ae647d735abc42db078804cf93c333912f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:23:59 +0100 Subject: [PATCH 142/150] [mdr] Remove test For context, refer to the http://de.wikipedia.org/wiki/Depublizieren --- youtube_dl/extractor/mdr.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index d29cf2c07..08ce0647f 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -8,23 +8,8 @@ from ..utils import ( class MDRIE(InfoExtractor): _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*' - - _TESTS = [{ - u'url': u'http://www.mdr.de/mediathek/themen/nachrichten/video165624_zc-c5c7de76_zs-3795826d.html', - u'file': u'165624.mp4', - u'md5': u'ae785f36ecbf2f19b42edf1bc9c85815', - u'info_dict': { - u"title": u"MDR aktuell Eins30 09.12.2013, 22:48 Uhr" - }, - }, - { - u'url': 
u'http://www.mdr.de/mediathek/radio/mdr1-radio-sachsen/audio718370_zc-67b21197_zs-1b9b2483.html', - u'file': u'718370.mp3', - u'md5': u'a9d21345a234c7b45dee612f290fd8d7', - u'info_dict': { - u"title": u"MDR 1 RADIO SACHSEN 10.12.2013, 05:00 Uhr" - }, - }] + + # No tests, MDR regularly deletes its videos def _real_extract(self, url): m = re.match(self._VALID_URL, url) From 147e4aece0cfa6ca5d584b57785dc99f31d1e4eb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:27:43 +0100 Subject: [PATCH 143/150] [vbox7] New video checksum --- youtube_dl/extractor/vbox7.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 4f803bcd3..5a136a952 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -15,7 +15,7 @@ class Vbox7IE(InfoExtractor): _TEST = { u'url': u'http://vbox7.com/play:249bb972c2', u'file': u'249bb972c2.flv', - u'md5': u'9c70d6d956f888bdc08c124acc120cfe', + u'md5': u'99f65c0c9ef9b682b97313e052734c3f', u'info_dict': { u"title": u"\u0421\u043c\u044f\u0445!
\u0427\u0443\u0434\u043e - \u0447\u0438\u0441\u0442 \u0437\u0430 \u0441\u0435\u043a\u0443\u043d\u0434\u0438 - \u0421\u043a\u0440\u0438\u0442\u0430 \u043a\u0430\u043c\u0435\u0440\u0430" } From c4d55a33fce91d846a6ae270ac3e4c69b7f45ba4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 20 Dec 2013 17:28:50 +0100 Subject: [PATCH 144/150] [brightcove] Test checksum changed --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b1b7526ca..f7f0041c0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,7 +26,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'8eccab865181d29ec2958f32a6a754f5', + u'md5': u'5423e113865d26e40624dce2e4b45d95', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', From bbafbe20c233d00e86fc87a1b1ccab8cf9e88232 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 03:17:56 +0100 Subject: [PATCH 145/150] [vimeo] Better formatting for regexp --- youtube_dl/extractor/vimeo.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ea4409528..4e8fef165 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,11 +16,20 @@ from ..utils import ( unsmuggle_url, ) + class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = 
r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' + _VALID_URL = r'''(?x) + (?P<proto>https?://)? + (?:(?:www|(?P<player>player))\.)? + vimeo(?P<pro>pro)?\.com/ + (?:.*?/)? + (?P<direct_link>play_redirect_hls\?clip_id=)? + (?:videos?/)? + (?P<id>[0-9]+) + /?(?:[?].*)?(?:[#].*)?$''' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ From 04ff34ab8977ae14654089f5bf9956ffa8f23c5f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 03:25:55 +0100 Subject: [PATCH 146/150] Show all matching URLs --- youtube_dl/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6df44020b..7e156d4d4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -525,7 +525,6 @@ def _real_main(argv=None): for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()): compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '')) matchedUrls = [url for url in all_urls if ie.suitable(url)] - all_urls = [url for url in all_urls if url not in matchedUrls] for mu in matchedUrls: compat_print(u' ' + mu) sys.exit(0) From 7115ca84aacf8eca3016c9a27cb6411ae5f52305 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 03:34:13 +0100 Subject: [PATCH 147/150] [vimeo/generic] Add support for embedded SWF vimeo videos --- youtube_dl/extractor/generic.py | 8 +++++++- youtube_dl/extractor/vimeo.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bdb4f58d6..7a14c98f9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -222,7 +222,7 @@ class GenericIE(InfoExtractor): self.to_screen(u'Brightcove video detected.') return self.url_result(bc_url, 'Brightcove') - # Look for embedded Vimeo 
player + # Look for embedded (iframe) Vimeo player mobj = re.search( r'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage) if mobj: @@ -230,6 +230,12 @@ class GenericIE(InfoExtractor): surl = smuggle_url(player_url, {'Referer': url}) return self.url_result(surl, 'Vimeo') + # Look for embedded (swf embed) Vimeo player + mobj = re.search( + r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage) + if mobj: + return self.url_result(mobj.group(1), 'Vimeo') + # Look for embedded YouTube player matches = re.findall(r'''(?x) (?:<iframe[^>]+?src=|embedSWF\(\s*) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4e8fef165..c3623fcbe 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -26,10 +26,10 @@ class VimeoIE(InfoExtractor): (?:(?:www|(?P<player>player))\.)? vimeo(?P<pro>pro)?\.com/ (?:.*?/)? - (?P<direct_link>play_redirect_hls\?clip_id=)? + (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? (?:videos?/)? 
(?P<id>[0-9]+) - /?(?:[?].*)?(?:[#].*)?$''' + /?(?:[?&].*)?(?:[#].*)?$''' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ From 5a94982abe4b9824c8e886c8176f930541714adc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 03:52:12 +0100 Subject: [PATCH 148/150] Remove unused import --- youtube_dl/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7e156d4d4..63437301b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -56,7 +56,6 @@ from .utils import ( compat_print, DateRange, decodeOption, - determine_ext, get_term_width, DownloadError, get_cachedir, From e302f9ce324c13e24a717027eaa7a918658b4e8a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 03:57:42 +0100 Subject: [PATCH 149/150] [youtube:user] Speed up --match-title --- youtube_dl/extractor/youtube.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c860eedda..a68576547 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1717,7 +1717,7 @@ class YoutubeUserIE(InfoExtractor): # page by page until there are no video ids - it means we got # all of them. 
- video_ids = [] + url_results = [] for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 @@ -1735,10 +1735,17 @@ class YoutubeUserIE(InfoExtractor): break # Extract video identifiers - ids_in_page = [] - for entry in response['feed']['entry']: - ids_in_page.append(entry['id']['$t'].split('/')[-1]) - video_ids.extend(ids_in_page) + entries = response['feed']['entry'] + for entry in entries: + title = entry['title']['$t'] + video_id = entry['id']['$t'].split('/')[-1] + url_results.append({ + '_type': 'url', + 'url': video_id, + 'ie_key': 'Youtube', + 'id': 'video_id', + 'title': title, + }) # A little optimization - if current page is not # "full", ie. does not contain PAGE_SIZE video ids then @@ -1746,12 +1753,9 @@ class YoutubeUserIE(InfoExtractor): # are no more ids on further pages - no need to query # again. - if len(ids_in_page) < self._GDATA_PAGE_SIZE: + if len(entries) < self._GDATA_PAGE_SIZE: break - url_results = [ - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in video_ids] return self.playlist_result(url_results, playlist_title=username) From 1b969041d701e2fb7ff106476b91084dbc67332a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 22 Dec 2013 07:43:54 +0100 Subject: [PATCH 150/150] [blinkx] Support mobile URLs (Closes #2022) --- youtube_dl/extractor/blinkx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index e118f2e9f..144ce64cc 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -9,7 +9,7 @@ from ..utils import ( class BlinkxIE(InfoExtractor): - _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/ce/|blinkx:)(?P<id>[^?]+)' + _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' _IE_NAME = u'blinkx' _TEST = {