From a504ced097e703a9bc6c18b6e31bcafb4783ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:03:41 +0100 Subject: [PATCH 01/83] Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents need to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available; checking it in each extractor would be repetitive. * It makes it easy to support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 
--- test/test_subtitles.py | 24 ++++----- youtube_dl/YoutubeDL.py | 85 +++++++++++++++++++++++++++--- youtube_dl/__init__.py | 1 - youtube_dl/extractor/common.py | 20 ++++++- youtube_dl/extractor/ted.py | 18 ++++--- youtube_dl/options.py | 4 +- youtube_dl/postprocessor/ffmpeg.py | 6 +-- 7 files changed, 121 insertions(+), 37 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..fbc9eaf4d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() - self.ie = self.IE(self.DL) + self.ie = self.IE() + self.DL.add_info_extractor(self.ie) def getInfoDict(self): - info_dict = self.ie.extract(self.url) + info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict['subtitles'] + subtitles = info_dict['subtitles'] + if not subtitles: + return subtitles + for sub_info in subtitles.values(): + if sub_info.get('data') is None: + uf = self.DL.urlopen(sub_info['url']) + sub_info['data'] = uf.read().decode('utf-8') + return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): @@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles): def test_no_writesubtitles(self): subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) + self.assertFalse(subtitles) def test_subtitles(self): self.DL.params['writesubtitles'] = True @@ -196,18 +204,10 @@ class TestTedSubtitles(BaseTestSubtitles): self.assertTrue(len(subtitles.keys()) >= 28) def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - 
self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_multiple_langs(self): self.DL.params['writesubtitles'] = True langs = ['es', 'fr', 'de'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..e665e3d53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@ -1019,6 +1019,11 @@ class YoutubeDL(object): info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + return + info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -1147,6 +1152,53 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, available_subs): + """Select the requested subtitles and their format""" + if not available_subs: + return available_subs + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = 
self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + if isinstance(formats, compat_str): + # TODO: convert all IE with subtitles support to the new format + # and remove this + subs[lang] = { + 'ext': formats_preference[0], + 'data': formats, + } + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1253,11 +1305,18 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,7 +1324,7 @@ class YoutubeDL(object): else: 
self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@ -1586,6 +1645,18 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles): + if not subtitles: + self.to_screen('%s has no subtitles' % video_id) + return + header_line = 'Language formats' + sub_lines = [ + '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) + for lang, formats in subtitles.items()] + self.to_screen( + 'Available subtitles for %s:\n%s\n%s' % + (video_id, header_line, '\n'.join(sub_lines))) + def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ed22f169f..5f2585003 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -226,7 +226,6 @@ def _real_main(argv=None): if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c784eedb9..161c623eb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -151,8 +151,14 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. 
"subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A url pointing to the subtitles file + Note: YoutubeDL.extract_info will get the requested + format and replace the "subformats" list with it. duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video @@ -993,6 +999,16 @@ class InfoExtractor(object): any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + subtitles = {} + list_subtitles = self._downloader.params.get('listsubtitles') + if self._downloader.params.get('writesubtitles', False) or list_subtitles: + subtitles.update(self._get_subtitles(*args, **kwargs)) + return subtitles + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10b3b706a..1809eaae4 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import json import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor): _VALID_URL = r'''(?x) (?Phttps?://) (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ @@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor): video_id = compat_str(talk_info['id']) # subtitles video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -183,13 +180,18 @@ class 
TEDIE(SubtitlesInfoExtractor): 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] return sub_lang_list else: self._downloader.report_warning('video doesn\'t have subtitles') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 873432bee..4fcf8c83d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None): help='lists all available subtitles for the video') subtitles.add_option( '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', - help='subtitle format (default=srt) ([sbv/vtt] youtube only)') + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', + help='subtitle format, accepts formats preference, for example: "ass/srt/best"') subtitles.add_option( '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5238ce534..d1bbfbfe3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): 'zu': 'zul', } - def __init__(self, downloader=None, subtitlesformat='srt'): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) - self._subformat = subtitlesformat - @classmethod def _conver_lang_code(cls, code): """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -472,7 +468,7 @@ class 
FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_langs = [key for key in information['subtitles']] filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()] opts = [ '-map', '0', From b5857f62e2c2ca70316e041212aa9e89d54cc253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:21:42 +0100 Subject: [PATCH 02/83] [crunchyroll] Convert to new subtitles system --- youtube_dl/extractor/crunchyroll.py | 66 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1680f532f..f1da7d09b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -25,10 +25,9 @@ from ..aes import ( aes_cbc_decrypt, inc, ) -from .common import InfoExtractor -class CrunchyrollIE(SubtitlesInfoExtractor): +class CrunchyrollIE(InfoExtractor): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -187,6 +186,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _get_subtitles(self, video_id, webpage): + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage( + 
'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, + video_id, note='Downloading subtitles for ' + sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) + iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) + data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') + lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) + if not lang_code: + continue + sub_root = xml.etree.ElementTree.fromstring(subtitle) + subtitles[lang_code] = [ + { + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, + { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }, + ] + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') @@ -249,34 +280,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'format_id': video_format, }) - subtitles = {} - sub_format = self._downloader.params.get('subtitlesformat', 'srt') - for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): - sub_page = self._download_webpage( - 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, - video_id, note='Downloading subtitles for ' + sub_name) - id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) - iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) - data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) - if not id or not iv or not data: - continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') - 
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - if sub_format == 'ass': - subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root) - else: - subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, From 6b597516c12c7fd81e832f3ec05dd0dca6089823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:32:40 +0100 Subject: [PATCH 03/83] [atresplayer] Convert to new subtitles system --- youtube_dl/extractor/atresplayer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index f016368fa..7669e0e3d 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import time import hmac -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, @@ -17,7 +17,7 @@ from ..utils import ( ) -class AtresPlayerIE(SubtitlesInfoExtractor): +class AtresPlayerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P.+?)_\d+\.html' _TESTS = [ { @@ -144,13 +144,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor): thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') subtitles = {} - subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle: - subtitles['es'] = subtitle - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitle_url = 
xpath_text(episode, './media/asset/files/subtitle', 'subtitle') + if subtitle_url: + subtitles['es'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] return { 'id': video_id, @@ -159,5 +158,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } From 65469a7f8b0ba50bd3c8918707e35125962aa2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:52:07 +0100 Subject: [PATCH 04/83] [vimeo] Convert to new subtitles system Removed some tests, the behaviour should be checked in a test for the YoutubeDL class --- test/parameters.json | 2 +- test/test_subtitles.py | 17 ----------------- youtube_dl/extractor/vimeo.py | 15 ++++++--------- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index af77b89b4..cbff9bd16 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -28,7 +28,7 @@ "retries": 10, "simulate": false, "subtitleslang": null, - "subtitlesformat": "srt", + "subtitlesformat": "best", "test": true, "updatetime": true, "usenetrc": false, diff --git a/test/test_subtitles.py b/test/test_subtitles.py index fbc9eaf4d..3f2d61d36 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -240,10 +240,6 @@ class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() @@ -261,19 +257,6 @@ class TestVimeoSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - 
self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://vimeo.com/56015672' diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 303e81447..5930d5984 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -6,7 +6,6 @@ import re import itertools from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -51,7 +50,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._download_webpage(login_request, None, False, 'Wrong login info') -class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): +class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs @@ -368,12 +367,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): text_tracks = config['request'].get('text_tracks') if text_tracks: for tt in text_tracks: - subtitles[tt['lang']] = 'http://vimeo.com' + tt['url'] - - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'http://vimeo.com' + tt['url'], + }] return { 'id': video_id, @@ -389,7 +386,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } From 
c84dd8a90dcc75547b343449b921b644a2119c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:12:31 +0100 Subject: [PATCH 05/83] [YoutubeDL] store the subtitles to download in the 'requested_subtitles' field We need to keep the original subtitles information, so that the '--load-info' option can be used to list or select the subtitles again. We'll also be able to have a separate field for storing the automatic captions info. --- test/test_subtitles.py | 2 +- youtube_dl/YoutubeDL.py | 6 +++--- youtube_dl/extractor/common.py | 2 -- youtube_dl/postprocessor/ffmpeg.py | 7 ++++--- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 3f2d61d36..b3c615c4f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -36,7 +36,7 @@ class BaseTestSubtitles(unittest.TestCase): def getSubtitles(self): info_dict = self.getInfoDict() - subtitles = info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] if not subtitles: return subtitles for sub_info in subtitles.values(): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e665e3d53..8545dc9e9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1022,7 +1022,7 @@ class YoutubeDL(object): if self.params.get('listsubtitles', False): self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) return - info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1301,10 +1301,10 @@ class YoutubeDL(object): subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if 
subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] if sub_info.get('data') is not None: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 161c623eb..d149e0f92 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -157,8 +157,6 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A url pointing to the subtitles file - Note: YoutubeDL.extract_info will get the requested - format and replace the "subformats" list with it. duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index d1bbfbfe3..e42298f0e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -462,13 +462,14 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): if information['ext'] != 'mp4': self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - if not information.get('subtitles'): + subtitles = information.get('requested_subtitles') + if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return True, information - sub_langs = [key for key in information['subtitles']] + sub_langs = list(subtitles.keys()) filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()] + input_files = [filename] + [subtitles_filename(filename, lang, 
sub_info['ext']) for lang, sub_info in subtitles.items()] opts = [ '-map', '0', From a1f2a06b34807a2e1b5eb5176fa418da2405392d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:28:06 +0100 Subject: [PATCH 06/83] [dailymotion] Convert to new subtitles system --- test/test_subtitles.py | 17 ----------------- youtube_dl/extractor/dailymotion.py | 10 +++------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index b3c615c4f..84ae0e714 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -127,10 +127,6 @@ class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() @@ -148,19 +144,6 @@ class TestDailymotionSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index cf5841a7c..4ca892926 100644 --- a/youtube_dl/extractor/dailymotion.py +++ 
b/youtube_dl/extractor/dailymotion.py @@ -6,7 +6,6 @@ import json import itertools from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@ -31,7 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): return request -class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P[^/?_]+)' @@ -143,9 +142,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, webpage) - return view_count = str_to_int(self._search_regex( r'video_views_count[^>]+>\s+([\d\.,]+)', @@ -169,7 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@ -179,7 +175,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return {} info = json.loads(sub_list) if (info['total'] > 0): - sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) return sub_lang_list self._downloader.report_warning('video doesn\'t have subtitles') return {} From 360e1ca5ccabcb5d48228d9472b09f1bce68bbc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:44:17 +0100 Subject: [PATCH 07/83] [youtube] Convert to new subtitles system The automatic captions are stored in the 
'automatic_captions' field, which is used if no normal subtitles are found for a specific language. --- test/test_subtitles.py | 5 --- youtube_dl/YoutubeDL.py | 24 ++++++++++---- youtube_dl/extractor/common.py | 12 +++++++ youtube_dl/extractor/youtube.py | 57 ++++++++++++++++++--------------- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 84ae0e714..91cebce28 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_no_writesubtitles(self): - self.DL.params['writesubtitles'] = False - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8545dc9e9..a47f8f5de 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1020,9 +1020,13 @@ class YoutubeDL(object): info_dict['upload_date'] = upload_date.strftime('%Y%m%d') if self.params.get('listsubtitles', False): - self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') return - info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], info_dict.get('subtitles'), + info_dict.get('automatic_captions')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1152,8 +1156,14 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs): + def 
process_subtitles(self, video_id, available_subs, available_autocaps): """Select the requested subtitles and their format""" + if available_autocaps and self.params.get('writeautomaticsub'): + available_subs = available_subs.copy() + for lang, cap_info in available_autocaps.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + if not available_subs: return available_subs @@ -1645,17 +1655,17 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - def list_subtitles(self, video_id, subtitles): + def list_subtitles(self, video_id, subtitles, name='subtitles'): if not subtitles: - self.to_screen('%s has no subtitles' % video_id) + self.to_screen('%s has no %s' % (video_id, name)) return header_line = 'Language formats' sub_lines = [ '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) for lang, formats in subtitles.items()] self.to_screen( - 'Available subtitles for %s:\n%s\n%s' % - (video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:\n%s\n%s' % + (name, video_id, header_line, '\n'.join(sub_lines))) def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d149e0f92..fe7d8dbc9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -157,6 +157,8 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A url pointing to the subtitles file + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. 
like_count: Number of positive ratings of the video @@ -1007,6 +1009,16 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + def extract_automatic_captions(self, *args, **kwargs): + automatic_captions = {} + list_subtitles = self._downloader.params.get('listsubtitles') + if self._downloader.params.get('writeautomaticsub', False) or list_subtitles: + automatic_captions.update(self._get_automatic_captions(*args, **kwargs)) + return automatic_captions + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35ef4c303..1b2dbf276 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': 
video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions 
but there are subtitles @@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, From 48246541da66a12486505804f9519391a298ff54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:17:47 +0100 Subject: [PATCH 08/83] [ceskatelevize] Convert to new subtitles system --- youtube_dl/extractor/ceskatelevize.py | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index f70e090bb..65f6be623 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, @@ -15,7 +15,7 @@ from ..utils import ( ) -class CeskaTelevizeIE(SubtitlesInfoExtractor): +class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' _TESTS = [ @@ -107,13 +107,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): subtitles = {} subs = item.get('subtitles') if subs: - subtitles['cs'] = 
subs[0]['url'] - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + subtitles = self.extract_subtitles(episode_id, subs) return { 'id': episode_id, @@ -125,11 +119,20 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + @staticmethod def _fix_subtitles(subtitles): """ Convert millisecond-based subtitles to SRT """ - if subtitles is None: - return subtitles # subtitles not requested def _msectotimecode(msec): """ Helper utility to convert milliseconds to timecode """ @@ -149,7 +152,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): else: yield line - fixed_subtitles = {} - for k, v in subtitles.items(): - fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) - return fixed_subtitles + return "\r\n".join(_fix_subtitle(subtitles)) From bd7fe0cf6668c9ea9272dbe25774072b383e67d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:23:09 +0100 Subject: [PATCH 09/83] [walla] Convert to new subtitles system --- youtube_dl/extractor/walla.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 672bda7a7..24efbd6e6 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ( xpath_text, int_or_none, ) -class WallaIE(SubtitlesInfoExtractor): +class WallaIE(InfoExtractor): _VALID_URL = 
r'http://vod\.walla\.co\.il/[^/]+/(?P\d+)/(?P.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', @@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor): subtitles = {} for subtitle in item.findall('./subtitles/subtitle'): lang = xpath_text(subtitle, './title') - subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'ext': 'srt', + 'url': xpath_text(subtitle, './src'), + }] formats = [] for quality in item.findall('./qualities/quality'): From 85920dd01d98cf74ea7d3ab7834a3b50cd6f1fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:56:25 +0100 Subject: [PATCH 10/83] [bliptv] Convert to new subtitles system --- test/test_subtitles.py | 2 -- youtube_dl/extractor/bliptv.py | 34 ++++++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 91cebce28..0ca510310 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -200,13 +200,11 @@ class TestBlipTVSubtitles(BaseTestSubtitles): IE = BlipTVIE def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 436cc5155..8c7ba4b91 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -3,7 +3,6 @@ 
from __future__ import unicode_literals import re from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@ -18,7 +17,7 @@ from ..utils import ( ) -class BlipTVIE(SubtitlesInfoExtractor): +class BlipTVIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P\d+)|((?:play/|api\.swf#)(?P[\da-zA-Z+_]+)))' _TESTS = [ @@ -143,7 +142,7 @@ class BlipTVIE(SubtitlesInfoExtractor): categories = [category.text for category in item.findall('category')] formats = [] - subtitles = {} + subtitles_urls = {} media_group = item.find(media('group')) for media_content in media_group.findall(media('content')): @@ -161,7 +160,7 @@ class BlipTVIE(SubtitlesInfoExtractor): } lang = role.rpartition('-')[-1].strip().lower() langcode = LANGS.get(lang, lang) - subtitles[langcode] = url + subtitles_urls[langcode] = url elif media_type.startswith('video/'): formats.append({ 'url': real_url, @@ -175,11 +174,7 @@ class BlipTVIE(SubtitlesInfoExtractor): }) self._sort_formats(formats) - # subtitles - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, subtitles_urls) return { 'id': video_id, @@ -192,15 +187,22 @@ class BlipTVIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'categories': categories, 'formats': formats, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } - def _download_subtitle_url(self, sub_lang, url): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - return self._download_webpage(req, None, note=False) + def _get_subtitles(self, video_id, subtitles_urls): + subtitles = {} + for lang, url in subtitles_urls.items(): + # For some weird reason, blip.tv serves a video 
instead of subtitles + # when we request with a common UA + req = compat_urllib_request.Request(url) + req.add_header('User-Agent', 'youtube-dl') + subtitles[lang] = [{ + # The extension is 'srt' but it's actually an 'ass' file + 'ext': 'ass', + 'data': self._download_webpage(req, None, note=False), + }] + return subtitles class BlipTVUserIE(InfoExtractor): From 9868ea493626a3a81d30d084fd00d22982a0f86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 22:16:29 +0100 Subject: [PATCH 11/83] [extractor/common] Simplify subtitles handling methods Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables. --- youtube_dl/extractor/common.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fe7d8dbc9..7d8ce1808 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1000,21 +1000,19 @@ class InfoExtractor(object): return not any_restricted def extract_subtitles(self, *args, **kwargs): - subtitles = {} - list_subtitles = self._downloader.params.get('listsubtitles') - if self._downloader.params.get('writesubtitles', False) or list_subtitles: - subtitles.update(self._get_subtitles(*args, **kwargs)) - return subtitles + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") def extract_automatic_captions(self, *args, **kwargs): - automatic_captions = {} - list_subtitles = self._downloader.params.get('listsubtitles') - if self._downloader.params.get('writeautomaticsub', False) or list_subtitles: - 
automatic_captions.update(self._get_automatic_captions(*args, **kwargs)) - return automatic_captions + if (self._downloader.params.get('writeautomaticsub', False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") From edab9dbf4d00a7f76fbfd2df9ef4b205c88e47a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 22:59:19 +0100 Subject: [PATCH 12/83] [YoutubeDL] use the 'render_table' function for listing the subtitles --- youtube_dl/YoutubeDL.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a47f8f5de..f8b8fb0c1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1659,13 +1659,12 @@ class YoutubeDL(object): if not subtitles: self.to_screen('%s has no %s' % (video_id, name)) return - header_line = 'Language formats' - sub_lines = [ - '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) - for lang, formats in subtitles.items()] self.to_screen( - 'Available %s for %s:\n%s\n%s' % - (name, video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) def urlopen(self, req): """ Start an HTTP download """ From 37dd5d4629ae955940265f245316c43cd0373a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 16:54:36 +0100 Subject: [PATCH 13/83] [mit] Don't set the subtitles field YouTube already provides them in more formats --- youtube_dl/extractor/mit.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py 
index 3c61a850f..d7ab6a9ae 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -5,9 +5,6 @@ import json from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import ( - compat_urlparse, -) from ..utils import ( clean_html, ExtractorError, @@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor): 'upload_date': '20121109', 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', - # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' } }, { @@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor): 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', - # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' } } ] @@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7]) else: # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) @@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5]) else: raise ExtractorError('Unable to find embedded YouTube video.') video_id = YoutubeIE.extract_id(yt) @@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor): 'title': title, 'description': description, 'url': yt, - 'url_transparent' - 'subtitles': subs, 'ie_key': 
'Youtube', } From 18c1c424057dd06f85f4420b14089e032fcb0000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:20:22 +0100 Subject: [PATCH 14/83] [drtv] Convert to new subtitles system --- youtube_dl/extractor/drtv.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index d5df18d7c..8257e35a4 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor -from .common import ExtractorError +from .common import InfoExtractor, ExtractorError from ..utils import parse_iso8601 -class DRTVIE(SubtitlesInfoExtractor): +class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' _TEST = { @@ -76,7 +75,7 @@ class DRTVIE(SubtitlesInfoExtractor): } for subs in subtitles_list: lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = subs['Uri'] + subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] if not formats and restricted_to_denmark: raise ExtractorError( @@ -84,10 +83,6 @@ class DRTVIE(SubtitlesInfoExtractor): self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - return { 'id': video_id, 'title': title, @@ -96,5 +91,5 @@ class DRTVIE(SubtitlesInfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } From 311c39383827e42649a287633a67ef021476d23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:46:33 +0100 Subject: [PATCH 15/83] [lynda] Convert to new subtitles system --- test/test_subtitles.py | 13 ++++++++ youtube_dl/extractor/lynda.py | 60 
++++++++++++++--------------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 0ca510310..ee170879f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -18,6 +18,7 @@ from youtube_dl.extractor import ( VimeoIE, WallaIE, CeskaTelevizeIE, + LyndaIE, ) @@ -304,5 +305,17 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.assertEqual(len(subtitles), 0) +class TestLyndaSubtitles(BaseTestSubtitles): + url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' + IE = LyndaIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 762cefa34..109055e72 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re import json -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..compat import ( compat_str, @@ -16,7 +15,7 @@ from ..utils import ( ) -class LyndaIE(SubtitlesInfoExtractor): +class LyndaIE(InfoExtractor): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' @@ -88,11 +87,7 @@ class LyndaIE(SubtitlesInfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, page) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) + subtitles = self.extract_subtitles(video_id, page) return { 'id': video_id, @@ -144,38 +139,31 @@ class LyndaIE(SubtitlesInfoExtractor): if 
re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: raise ExtractorError('Unable to log in') - def _fix_subtitles(self, subtitles): - if subtitles is None: - return subtitles # subtitles not requested - - fixed_subtitles = {} - for k, v in subtitles.items(): - subs = json.loads(v) - if len(subs) == 0: + def _fix_subtitles(self, subs): + srt = '' + for pos in range(0, len(subs) - 1): + seq_current = subs[pos] + m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) + if m_current is None: continue - srt = '' - for pos in range(0, len(subs) - 1): - seq_current = subs[pos] - m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) - if m_current is None: - continue - seq_next = subs[pos + 1] - m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) - if m_next is None: - continue - appear_time = m_current.group('timecode') - disappear_time = m_next.group('timecode') - text = seq_current['Caption'] - srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) - if srt: - fixed_subtitles[k] = srt - return fixed_subtitles + seq_next = subs[pos + 1] + m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) + if m_next is None: + continue + appear_time = m_current.group('timecode') + disappear_time = m_next.group('timecode') + text = seq_current['Caption'] + srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) + if srt: + return srt - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - sub = self._download_webpage(url, None, False) - sub_json = json.loads(sub) - return {'en': url} if len(sub_json) > 0 else {} + subs = self._download_json(url, None, False) + if subs: + return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + else: + return {} class LyndaCourseIE(InfoExtractor): From b9b42f2ea0c564f3e75a8f052bfe0dfe21cf320f Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:57:10 +0100 Subject: [PATCH 16/83] [npo] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/npo.py | 14 +++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index ee170879f..b2195cac4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -19,6 +19,7 @@ from youtube_dl.extractor import ( WallaIE, CeskaTelevizeIE, LyndaIE, + NPOIE, ) @@ -317,5 +318,17 @@ class TestLyndaSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') +class TestNPOSubtitles(BaseTestSubtitles): + url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' + IE = NPOIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['nl'])) + self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c075618e8..9c01eb0af 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, @@ -12,7 +11,7 @@ from ..utils import ( ) -class NPOBaseIE(SubtitlesInfoExtractor): +class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', @@ -164,13 +163,10 @@ class NPOIE(NPOBaseIE): subtitles = {} if metadata.get('tt888') == 'ja': - subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - 
return - - subtitles = self.extract_subtitles(video_id, subtitles) + subtitles['nl'] = [{ + 'ext': 'vtt', + 'url': 'http://e.omroep.nl/tt888/%s' % video_id, + }] return { 'id': video_id, From 0af25f784bc5bff7cbce2d4af725b4bf2d2262c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 18:27:45 +0100 Subject: [PATCH 17/83] [mtv] Convert to new subtitles system --- test/test_subtitles.py | 16 ++++++++++++++++ youtube_dl/extractor/mtv.py | 26 +++++++------------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index b2195cac4..c018d9b49 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -20,6 +20,7 @@ from youtube_dl.extractor import ( CeskaTelevizeIE, LyndaIE, NPOIE, + ComedyCentralIE, ) @@ -330,5 +331,20 @@ class TestNPOSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') +class TestMTVSubtitles(BaseTestSubtitles): + url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + IE = ComedyCentralIE + + def getInfoDict(self): + return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index bc7f49ebb..c11de1cb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -23,7 +23,7 @@ def 
_media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVServicesInfoExtractor(SubtitlesInfoExtractor): +class MTVServicesInfoExtractor(InfoExtractor): _MOBILE_TEMPLATE = None @staticmethod @@ -95,25 +95,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): def _extract_subtitles(self, mdoc, mtvn_id): subtitles = {} - FORMATS = { - 'scc': 'cea-608', - 'eia-608': 'cea-608', - 'xml': 'ttml', - } - subtitles_format = FORMATS.get( - self._downloader.params.get('subtitlesformat'), 'ttml') for transcript in mdoc.findall('.//transcript'): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - for typographic in transcript.findall('./typographic'): - captions_format = typographic.get('format') - if captions_format == subtitles_format: - subtitles[lang] = compat_str(typographic.get('src')) - break - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(mtvn_id, subtitles) - return self.extract_subtitles(mtvn_id, subtitles) + subtitles[lang] = [{ + 'url': compat_str(typographic.get('src')), + 'ext': typographic.get('format') + } for typographic in transcript.findall('./typographic')] + return subtitles def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text @@ -196,8 +186,6 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): webpage, 'mgid') videos_info = self._get_videos_info(mgid) - if self._downloader.params.get('listsubtitles', False): - return return videos_info From 01561da142485a581e67ce98ef009ebe0ed7b4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 18:57:01 +0100 Subject: [PATCH 18/83] [nrk] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/nrk.py | 15 +++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c018d9b49..1e2324232 100644 --- a/test/test_subtitles.py +++ 
b/test/test_subtitles.py @@ -21,6 +21,7 @@ from youtube_dl.extractor import ( LyndaIE, NPOIE, ComedyCentralIE, + NRKTVIE, ) @@ -346,5 +347,17 @@ class TestMTVSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') +class TestNRKSubtitles(BaseTestSubtitles): + url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' + IE = NRKTVIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['no'])) + self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f6de26022..46f493cfc 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -10,7 +10,6 @@ from ..utils import ( parse_duration, unified_strdate, ) -from .subtitles import SubtitlesInfoExtractor class NRKIE(InfoExtractor): @@ -73,7 +72,7 @@ class NRKIE(InfoExtractor): } -class NRKTVIE(SubtitlesInfoExtractor): +class NRKTVIE(InfoExtractor): _VALID_URL = r'(?Phttp://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' 
_TESTS = [ @@ -156,7 +155,7 @@ class NRKTVIE(SubtitlesInfoExtractor): if self._downloader.params.get('verbose', False): self.to_screen('[debug] %s' % txt) - def _extract_captions(self, subtitlesurl, video_id, baseurl): + def _get_subtitles(self, subtitlesurl, video_id, baseurl): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) captions = self._download_xml(url, video_id, 'Downloading subtitles') @@ -170,7 +169,10 @@ class NRKTVIE(SubtitlesInfoExtractor): endtime = self._seconds2str(begin + duration) text = '\n'.join(p.itertext()) srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) - return {lang: srt} + return {lang: [ + {'ext': 'ttml', 'url': url}, + {'ext': 'srt', 'data': srt}, + ]} def _extract_f4m(self, manifest_url, video_id): return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) @@ -243,10 +245,7 @@ class NRKTVIE(SubtitlesInfoExtractor): webpage, 'subtitle URL', default=None) subtitles = None if subtitles_url: - subtitles = self._extract_captions(subtitles_url, video_id, baseurl) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl) return { 'id': video_id, From afbdd3acc36130d1a717b3cacab69c0dfc716622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 20:14:42 +0100 Subject: [PATCH 19/83] [rai] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/rai.py | 22 +++++++++------------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 1e2324232..4dbb50515 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -22,6 +22,7 @@ from youtube_dl.extractor import ( NPOIE, ComedyCentralIE, NRKTVIE, + RaiIE, ) @@ -359,5 +360,17 @@ class 
TestNRKSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') +class TestRaiSubtitles(BaseTestSubtitles): + url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' + IE = RaiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index aa26b7e0b..144e33982 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, ) @@ -12,7 +12,7 @@ from ..utils import ( ) -class RaiIE(SubtitlesInfoExtractor): +class RaiIE(InfoExtractor): _VALID_URL = r'(?Phttp://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _TESTS = [ { @@ -89,15 +89,7 @@ class RaiIE(SubtitlesInfoExtractor): 'ext': 'mp4', }) - if self._downloader.params.get('listsubtitles', False): - page = self._download_webpage(url, video_id) - self._list_available_subtitles(video_id, page) - return - - subtitles = {} - if self._have_to_download_any_subtitles: - page = self._download_webpage(url, video_id) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id, url) return { 'id': video_id, @@ -111,7 +103,8 @@ class RaiIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, url): + webpage = self._download_webpage(url, video_id) subtitles = {} m = re.search(r' Date: Wed, 18 
Feb 2015 20:37:16 +0100 Subject: [PATCH 20/83] [viki] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/viki.py | 17 +++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 4dbb50515..98d1afff4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -23,6 +23,7 @@ from youtube_dl.extractor import ( ComedyCentralIE, NRKTVIE, RaiIE, + VikiIE, ) @@ -372,5 +373,17 @@ class TestRaiSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') +class TestVikiSubtitles(BaseTestSubtitles): + url = 'http://www.viki.com/videos/1060846v-punch-episode-18' + IE = VikiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 944901e14..6816dacb6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,16 +2,17 @@ from __future__ import unicode_literals import re +from ..compat import compat_urlparse from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, US_RATINGS, ) -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor -class VikiIE(SubtitlesInfoExtractor): +class VikiIE(InfoExtractor): IE_NAME = 'viki' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' @@ -69,9 +70,6 @@ class VikiIE(SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, info_webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, info_webpage) - return return { 'id': video_id, @@ -85,12 +83,15 @@ class 
VikiIE(SubtitlesInfoExtractor): 'upload_date': upload_date, } - def _get_available_subtitles(self, video_id, info_webpage): + def _get_subtitles(self, video_id, info_webpage): res = {} - for sturl_html in re.findall(r'', info_webpage): + for sturl_html in re.findall(r'[a-z]+)\.vtt', sturl) if not m: continue - res[m.group('lang')] = sturl + res[m.group('lang')] = [{ + 'url': compat_urlparse.urljoin('http://www.viki.com', sturl), + 'ext': 'vtt', + }] return res From 8807f1277f8c69488046fc7215cc79165e976ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 14:54:50 +0100 Subject: [PATCH 21/83] [theplatform] Convert to new subtitles system --- test/test_subtitles.py | 15 +++++++++++++++ youtube_dl/extractor/theplatform.py | 18 +++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 98d1afff4..c04fe6f22 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -24,6 +24,7 @@ from youtube_dl.extractor import ( NRKTVIE, RaiIE, VikiIE, + ThePlatformIE, ) @@ -385,5 +386,19 @@ class TestVikiSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') +class TestThePlatformSubtitles(BaseTestSubtitles): + # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ + # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) + url = 'theplatform:JFUjUE1_ehvq' + IE = ThePlatformIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 1579822f2..5f24189cc 100644 --- 
a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,7 +8,7 @@ import binascii import hashlib -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) @@ -22,7 +22,7 @@ from ..utils import ( _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) -class ThePlatformIE(SubtitlesInfoExtractor): +class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?P(?:[^/\?]+/(?:swf|config)|onsite)/select/)? @@ -104,15 +104,11 @@ class ThePlatformIE(SubtitlesInfoExtractor): captions = info.get('captions') if isinstance(captions, list): for caption in captions: - lang, src = caption.get('lang'), caption.get('src') - if lang and src: - subtitles[lang] = src - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles[lang] = [{ + 'ext': 'srt' if mime == 'text/srt' else 'ttml', + 'url': src, + }] head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) From f13b1e7d7fd4a63c9ca4a0aa9930c540033cc408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 16:46:41 +0100 Subject: [PATCH 22/83] [bbccouk] Convert to new subtitles system I haven't found any video available outside the UK, so I haven't added tests. I have updated how the srt file is build, because (at least for www.bbc.co.uk/programmes/p02j9b69) the subtitles is inside 'span' elements. 
--- youtube_dl/extractor/bbccouk.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f23e39545..abc34a576 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals import xml.etree.ElementTree -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ExtractorError from ..compat import compat_HTTPError -class BBCCoUkIE(SubtitlesInfoExtractor): +class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' @@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor): formats.extend(conn_formats) return formats - def _extract_captions(self, media, programme_id): + def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) srt = '' + + def _extract_text(p): + if p.text is not None: + stripped_text = p.text.strip() + if stripped_text: + return stripped_text + return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), - p.text.strip() if p.text is not None else '') - subtitles[lang] = srt + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + { + 'data': srt, + 'ext': 'srt', + }, + ] 
return subtitles def _download_media_selector(self, programme_id): @@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor): elif kind == 'video': formats.extend(self._extract_video(media, programme_id)) elif kind == 'captions': - subtitles = self._extract_captions(media, programme_id) + subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor): else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(programme_id, subtitles) - return - self._sort_formats(formats) return { From fb7cb6823e5ace9437bc79f2e1928a30f317856b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 23:24:24 +0100 Subject: [PATCH 23/83] Remove the SubtitlesInfoExtractor class No longer needed --- youtube_dl/extractor/subtitles.py | 99 ------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 youtube_dl/extractor/subtitles.py diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py deleted file mode 100644 index 59a51268d..000000000 --- a/youtube_dl/extractor/subtitles.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( - ExtractorError, -) - - -class SubtitlesInfoExtractor(InfoExtractor): - @property - def _have_to_download_any_subtitles(self): - return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub')]) - - def _list_available_subtitles(self, video_id, webpage): - """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id, webpage) - auto_captions_list = self._get_available_automatic_caption(video_id, webpage) - sub_lang = 
",".join(list(sub_lang_list.keys())) - self.to_screen('%s: Available subtitles for video: %s' % - (video_id, sub_lang)) - auto_lang = ",".join(auto_captions_list.keys()) - self.to_screen('%s: Available automatic captions for video: %s' % - (video_id, auto_lang)) - - def extract_subtitles(self, video_id, webpage): - """ - returns {sub_lang: sub} ,{} if subtitles not found or None if the - subtitles aren't requested. - """ - if not self._have_to_download_any_subtitles: - return None - available_subs_list = {} - if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) - if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id, webpage)) - - if not available_subs_list: # error, it didn't get the available subtitles - return {} - if self._downloader.params.get('allsubtitles', False): - sub_lang_list = available_subs_list - else: - if self._downloader.params.get('subtitleslangs', False): - requested_langs = self._downloader.params.get('subtitleslangs') - elif 'en' in available_subs_list: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs_list.keys())[0]] - - sub_lang_list = {} - for sub_lang in requested_langs: - if sub_lang not in available_subs_list: - self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang) - continue - sub_lang_list[sub_lang] = available_subs_list[sub_lang] - - subtitles = {} - for sub_lang, url in sub_lang_list.items(): - subtitle = self._request_subtitle_url(sub_lang, url) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - - def _download_subtitle_url(self, sub_lang, url): - return self._download_webpage(url, None, note=False) - - def _request_subtitle_url(self, sub_lang, url): - """ makes the http request for the subtitle """ - try: - sub = self._download_subtitle_url(sub_lang, url) - except ExtractorError 
as err: - self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning('Did not fetch video subtitles') - return - return sub - - def _get_available_subtitles(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses - """ - - # By default, allow implementations to simply pass in the result - assert isinstance(webpage, dict), \ - '_get_available_subtitles not implemented' - return webpage - - def _get_available_automatic_caption(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses that support automatic captions, - otherwise it will return {} - """ - self._downloader.report_warning('Automatic Captions not supported by this server') - return {} From 5e9a033e6e8054605bb87c8448f95a5bb86a71bb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 20 Feb 2015 02:52:03 +0100 Subject: [PATCH 24/83] [imgur] Allow alternative values Every now and then, imgur.com goes crazy and gives us a generic title and description (otherwise it looks all fine though). Simply update the test case to allow for that craziness. --- youtube_dl/extractor/imgur.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index b16c7aed0..fe5d95e2c 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -19,16 +19,16 @@ class ImgurIE(InfoExtractor): 'info_dict': { 'id': 'A61SaA1', 'ext': 'mp4', - 'title': 'MRW gifv is up and running without any bugs', - 'description': 'The Internet\'s visual storytelling community. 
Explore, share, and discuss the best visual stories the Internet has to offer.', + 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', }, }, { 'url': 'https://imgur.com/A61SaA1', 'info_dict': { 'id': 'A61SaA1', 'ext': 'mp4', - 'title': 'MRW gifv is up and running without any bugs', - 'description': 'The Internet\'s visual storytelling community. Explore, share, and discuss the best visual stories the Internet has to offer.', + 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', }, }] From 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 10:49:45 +0100 Subject: [PATCH 25/83] [chirbit] Add new extractor. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/chirbit.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/chirbit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f225ac654..de08e69bc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,6 +63,7 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE +from .chirbit import ChirbitIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..06a3e1a7a --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ChirbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P[^/]+)' + _TEST = { + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'display_id': 'kukushtv_1423231243', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + } + } + + def _real_extract(self, url): + audio_linkid = self._match_id(url) + webpage = self._download_webpage(url, audio_linkid) + + audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') + audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') + audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + + return { + 'id': audio_linkid, + 'display_id': audio_id, + 'title': audio_title, + 'url': audio_url + } From 365577f5676d63089cb834855dd4cdce7d0dc8aa Mon Sep 17 00:00:00 2001 From: "Leslie P. 
Polzer" Date: Fri, 20 Feb 2015 14:48:12 +0100 Subject: [PATCH 26/83] [chirbit] add profile extractor. --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/chirbit.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index de08e69bc..94e150826 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,7 +63,7 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .chirbit import ChirbitIE +from .chirbit import ChirbitIE, ChirbitProfileIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 06a3e1a7a..47ce94aa0 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import clean_html class ChirbitIE(InfoExtractor): @@ -32,3 +35,63 @@ class ChirbitIE(InfoExtractor): 'title': audio_title, 'url': audio_url } + +class ChirbitProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'playlist_count': 3, + 'info_dict': { + '_type': 'playlist', + 'title': 'ScarletBeauty', + 'id': 'ScarletBeauty' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + # Chirbit has a pretty weird "Last Page" navigation behavior. + # We grab the profile's oldest entry to determine when to + # stop fetching entries. 
+ oldestpage = self._download_webpage(url + '/24599', profile_id) + oldest_page_entries = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + oldestpage); + oldestentry = clean_html(oldest_page_entries[-1]); + + ids = [] + titles = [] + n = 0 + while True: + page = self._download_webpage(url + '/' + str(n), profile_id) + page_ids = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + page); + page_titles = re.findall( + r'''(.*?)''', + page); + ids += page_ids + titles += page_titles + if oldestentry in page_ids: + break + n += 1 + + entries = [] + i = 0 + for id in ids: + entries.append({ + 'id': id, + 'title': titles[i], + 'url': 'http://audio.chirbit.com/' + id + '.mp3' + }); + i += 1 + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; From 3da0db62e6c4122b151349e7a05f59803da4fbbc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 20 Feb 2015 23:22:47 +0100 Subject: [PATCH 27/83] [escapist] Fix extraction (fixes #5017) --- youtube_dl/extractor/escapist.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 4303feccd..6b693b3b6 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -22,6 +22,7 @@ class EscapistIE(InfoExtractor): 'uploader_id': 'the-escapist-presents', 'uploader': 'The Escapist Presents', 'title': "Breaking Down Baldur's Gate", + 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -40,9 +41,8 @@ class EscapistIE(InfoExtractor): raw_title = self._html_search_meta('title', webpage, fatal=True) title = raw_title.partition(' : ')[2] - player_url = self._og_search_video_url(webpage, name='player URL') - config_url = compat_urllib_parse.unquote(self._search_regex( - r'config=(.*)$', player_url, 'config URL')) + config_url = compat_urllib_parse.unquote(self._html_search_regex( + r' Date: Fri, 20 Feb 2015 
23:23:12 +0100 Subject: [PATCH 28/83] release 2015.02.20 --- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f6ba28e7a..dbff5e270 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -451,6 +451,7 @@ - **Turbo** - **Tutv** - **tv.dfb.de** + - **TV4**: tv4.se and tv4play.se - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 537e8cf60..9fd0ee963 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.19.3' +__version__ = '2015.02.20' From a00a8bcc8a131e0bea605ab3cfa0f09bedde9e0c Mon Sep 17 00:00:00 2001 From: CyberJacob Date: Fri, 20 Feb 2015 22:43:51 +0000 Subject: [PATCH 29/83] Change example URLs in readme (fixes #5018y) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2c148311..8ea31d605 100644 --- a/README.md +++ b/README.md @@ -571,7 +571,7 @@ Support requests for services that **do** purchase the rights to distribute thei ### How can I detect whether a given URL is supported by youtube-dl? -For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/v/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. +For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. 
It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. From fad6768bd1a67dccbb153ac371d3e82575321ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Feb 2015 18:00:25 +0600 Subject: [PATCH 30/83] [vimeo] Fix password protected videos (Closes #5001) --- youtube_dl/extractor/vimeo.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 78d287e0e..5f8649e35 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import json import re import itertools +import hashlib from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor @@ -225,6 +226,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id + password = self._downloader.params.get('videopassword', None) + if password: + headers['Cookie'] = '%s_password=%s' % (video_id, hashlib.md5(password).hexdigest()) + # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) try: From 62b013df0dcb5f902d745b77b3de62b64b828863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Feb 2015 18:31:10 +0600 Subject: [PATCH 31/83] [vimeo] Encode password before hash calculation --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5f8649e35..4cd2f73d9 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -228,7 +228,8 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): password = self._downloader.params.get('videopassword', None) if password: - headers['Cookie'] = '%s_password=%s' % (video_id, hashlib.md5(password).hexdigest()) + headers['Cookie'] = '%s_password=%s' % ( + video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) From 77b2986b5b0246234b72ae9dd78fb40f9d37374f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 14:51:07 +0100 Subject: [PATCH 32/83] [extractor/common] Recognize Indian censorship (#5021) --- youtube_dl/extractor/common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 08b8ad37c..ee64ad329 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -391,6 +391,16 @@ class InfoExtractor(object): if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if 'The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

(.*?)

', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content From 8fb3ac3649ca7df6f328971f58afa84dd9d05cc6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 14:55:13 +0100 Subject: [PATCH 33/83] PEP8: W503 --- devscripts/check-porn.py | 8 ++++---- test/test_swfinterp.py | 4 ++-- youtube_dl/YoutubeDL.py | 8 ++++---- youtube_dl/__init__.py | 16 ++++++++-------- youtube_dl/downloader/common.py | 10 +++++----- youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/adobetv.py | 7 ++++--- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/defense.py | 5 +++-- youtube_dl/utils.py | 4 ++-- 10 files changed, 36 insertions(+), 34 deletions(-) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 216282712..6a5bd9eda 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -45,12 +45,12 @@ for test in get_testcases(): RESULT = ('.' 
+ domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] - or test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or + test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] - and test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 9f18055e6..f1e899819 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -34,8 +34,8 @@ def _make_testfunc(testfile): def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') - if ((not os.path.exists(swf_file)) - or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + if ((not os.path.exists(swf_file)) or + os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 88809783b..ca7c3f5c6 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -308,8 +308,8 @@ class YoutubeDL(object): raise if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@ -1366,8 +1366,8 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = 
self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in outtmpl - and self.params.get('max_downloads') != 1): + '%' not in outtmpl and + self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index eac2a26ec..25ab3fdfe 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -189,14 +189,14 @@ def _real_main(argv=None): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: opts.outtmpl = opts.outtmpl.decode(preferredencoding()) - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 45e55b99c..3ae90021a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -311,14 +311,14 @@ class FileDownloader(object): """ nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) - and os.path.exists(encodeFilename(filename)) + self.params.get('nooverwrites', False) and + os.path.exists(encodeFilename(filename)) ) continuedl_and_exists = ( - self.params.get('continuedl', False) - and os.path.isfile(encodeFilename(filename)) - and not self.params.get('nopart', False) + self.params.get('continuedl', False) and + os.path.isfile(encodeFilename(filename)) and + not self.params.get('nopart', False) ) # Check file already present diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b40ebfa50..7b8fe8cf5 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -325,8 +325,8 @@ class F4mFD(FileDownloader): state['frag_index'] += 1 estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) - / (state['frag_index'] + 1) * total_frags) + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) time_now = time.time() state['total_bytes_estimate'] = estimated_size state['elapsed'] = time_now - start diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 28e07f8b0..97d128560 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -28,7 +28,6 @@ class AdobeTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) player = self._parse_json( @@ -44,8 +43,10 @@ class AdobeTVIE(InfoExtractor): self._html_search_meta('datepublished', webpage, 'upload date')) duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') - or 
self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration')) + self._html_search_meta('duration', webpage, 'duration') or + self._search_regex( + r'Runtime:\s*(\d{2}:\d{2}:\d{2})', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'
\s*Views?:\s*([\d,.]+)\s*
', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ee64ad329..d3f86cf4a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -808,8 +808,8 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' - + (media_el.attrib.get('href') or media_el.attrib.get('url'))) + manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + + (media_el.attrib.get('href') or media_el.attrib.get('url'))) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 2b90bf4fc..98e3aedfd 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -25,8 +25,9 @@ class DefenseGouvFrIE(InfoExtractor): r"flashvars.pvg_id=\"(\d+)\";", webpage, 'ID') - json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/' - + video_id) + json_url = ( + 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' % + video_id) info = self._download_json(json_url, title, 'Downloading JSON config') video_url = info['renditions'][0]['url'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 238b6556b..475fad3c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -900,8 +900,8 @@ def _windows_write_string(s, out): def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or + GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False From 
93540ee10e4143f8de7885af2d68c213aab7d8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 19:31:39 +0100 Subject: [PATCH 34/83] [rtve] Fix the video url Changing mvod to mvod1 fixes the url, we don't need to add the query. --- youtube_dl/extractor/rtve.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 3469d9578..e60f85b5b 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,7 @@ import re import time from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( struct_unpack, remove_end, @@ -96,12 +97,10 @@ class RTVEALaCartaIE(InfoExtractor): ).replace('.net.rtve', '.multimedia.cdn.rtve') video_path = self._download_webpage( auth_url, video_id, 'Getting video url') - # Use mvod.akcdn instead of flash.akamaihd.multimedia.cdn to get + # Use mvod1.akcdn instead of flash.akamaihd.multimedia.cdn to get # the right Content-Length header and the mp4 format - video_url = ( - 'http://mvod.akcdn.rtve.es/{0}&v=2.6.8' - '&fp=MAC%2016,0,0,296&r=MRUGG&g=OEOJWFXNFGCP'.format(video_path) - ) + video_url = compat_urlparse.urljoin( + 'http://mvod1.akcdn.rtve.es/', video_path) return { 'id': video_id, From 4aeccadf4ef8528c252c917d071e98a091f7766c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:38:57 +0600 Subject: [PATCH 35/83] [zapiks] Add extractor (#5014) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/zapiks.py | 102 +++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 youtube_dl/extractor/zapiks.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7d0c6b5ac..ef0adfd87 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -612,6 +612,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) +from 
.zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ( ZingMp3SongIE, diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py new file mode 100644 index 000000000..12810637e --- /dev/null +++ b/youtube_dl/extractor/zapiks.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + xpath_with_ns, + xpath_text, + int_or_none, +) + + +class ZapiksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zapiks\.fr/(?:(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' + _TESTS = [ + { + 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', + 'md5': 'aeb3c473b2d564b2d46d664d28d5f050', + 'info_dict': { + 'id': '80798', + 'ext': 'mp4', + 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', + 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 528, + 'timestamp': 1359044972, + 'upload_date': '20130124', + 'view_count': int, + 'comment_count': int, + }, + }, + { + 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + if not video_id: + video_id = self._search_regex( + r'data-media-id="(\d+)"', webpage, 'video id') + + playlist = self._download_xml( + 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id, + display_id) + + NS_MAP = { + 'jwplayer': 'http://rss.jwpcdn.com/' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./channel/item') + + title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage) + description = 
self._og_search_description(webpage, default=None) + thumbnail = xpath_text( + item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None), ' ') + + view_count = int_or_none(self._search_regex( + r'UserPlays:(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'UserComments:(\d+)', webpage, 'comment count', default=None)) + + formats = [] + for source in item.findall(ns('./jwplayer:source')): + format_id = source.attrib['label'] + f = { + 'url': source.attrib['file'], + 'format_id': format_id, + } + m = re.search(r'^(?P\d+)[pP]', format_id) + if m: + f['height'] = int(m.group('height')) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + } From 255fca5eea70a171530a5a0f2af143362f0211cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:39:26 +0600 Subject: [PATCH 36/83] [generic] Add support for Zapiks embeds (#5014) --- youtube_dl/extractor/generic.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8dce96a64..875e1bf05 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -547,7 +547,16 @@ class GenericIE(InfoExtractor): 'id': 'aanslagen-kopenhagen', 'title': 'Aanslagen Kopenhagen | RTL Nieuws', } - } + }, + # Zapiks embed + { + 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', + 'info_dict': { + 'id': '118046', + 'ext': 'mp4', + 'title': 
'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', + } + }, ] def report_following_redirect(self, new_url): @@ -1098,6 +1107,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') + # Look for Zapiks embed + mobj = re.search( + r']+src="(?Phttps?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Zapiks') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From ea5152cae110d55b82c755c23926f077b90c071c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:42:47 +0600 Subject: [PATCH 37/83] [zapiks] Extend _VALID_URL (#5014) --- youtube_dl/extractor/zapiks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index 12810637e..22a9a57e8 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -14,7 +14,7 @@ from ..utils import ( class ZapiksIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zapiks\.fr/(?:(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' + _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' _TESTS = [ { 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', @@ -32,6 +32,14 @@ class ZapiksIE(InfoExtractor): 'comment_count': int, }, }, + { + 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, { 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', 'only_matching': True, From c5181ab4101323de94bdb20850c64711c625c3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 02:10:26 +0600 Subject: [PATCH 38/83] [gdcvault] Fix rtmp streams 
(Closes #5024) --- youtube_dl/extractor/gdcvault.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index fed968f51..05f58f1af 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) +from ..utils import remove_end class GDCVaultIE(InfoExtractor): @@ -68,7 +69,9 @@ class GDCVaultIE(InfoExtractor): akami_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ - 'url': 'rtmp://' + akami_url + '/' + slide_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'slide deck video', 'quality': -2, 'preference': -2, @@ -76,7 +79,9 @@ class GDCVaultIE(InfoExtractor): }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text video_formats.append({ - 'url': 'rtmp://' + akami_url + '/' + speaker_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'speaker video', 'quality': -1, 'preference': -1, From 314368c822428437e60bbc24af65d5415717632c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Sat, 21 Feb 2015 22:19:39 +0200 Subject: [PATCH 39/83] [teamcoco] Fix extraction Also, use a single style of quotes --- youtube_dl/extractor/teamcoco.py | 49 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index a73da1c9c..5793dbc10 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals +import base64 import re from .common import 
InfoExtractor +from ..utils import qualities class TeamcocoIE(InfoExtractor): @@ -24,8 +26,8 @@ class TeamcocoIE(InfoExtractor): 'info_dict': { 'id': '19705', 'ext': 'mp4', - "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.", - "title": "Louis C.K. Interview Pt. 1 11/3/11", + 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', + 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'age_limit': 0, } } @@ -42,42 +44,39 @@ class TeamcocoIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - video_id = mobj.group("video_id") + video_id = mobj.group('video_id') if not video_id: video_id = self._html_search_regex( self._VIDEO_ID_REGEXES, webpage, 'video id') - data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id - data = self._download_xml( - data_url, display_id, 'Downloading data webpage') + embed_url = 'http://teamcoco.com/embed/v/%s' % video_id + embed = self._download_webpage( + embed_url, video_id, 'Downloading embed page') + + encoded_data = self._search_regex( + r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') + data = self._parse_json( + base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) - qualities = ['500k', '480p', '1000k', '720p', '1080p'] formats = [] - for filed in data.findall('files/file'): - if filed.attrib.get('playmode') == 'all': - # it just duplicates one of the entries - break - file_url = filed.text - m_format = re.search(r'(\d+(k|p))\.mp4', file_url) + get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) + for filed in data['files']: + m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) if m_format is not None: format_id = m_format.group(1) else: - format_id = filed.attrib['bitrate'] + format_id = filed['bitrate'] tbr = ( - int(filed.attrib['bitrate']) - if filed.attrib['bitrate'].isdigit() + int(filed['bitrate']) + if filed['bitrate'].isdigit() else None) - try: - quality = 
qualities.index(format_id) - except ValueError: - quality = -1 formats.append({ - 'url': file_url, + 'url': filed['url'], 'ext': 'mp4', 'tbr': tbr, 'format_id': format_id, - 'quality': quality, + 'quality': get_quality(format_id), }) self._sort_formats(formats) @@ -86,8 +85,8 @@ class TeamcocoIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'title': data['title'], + 'thumbnail': data.get('thumb', {}).get('href'), + 'description': data.get('teaser'), 'age_limit': self._family_friendly_search(webpage), } From e086e0eb6cef80db2d4ab44572a1a5d6b6f1dee0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 21:25:29 +0100 Subject: [PATCH 40/83] release 2015.02.21 --- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbff5e270..5fe3e47cd 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -559,6 +559,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **Zapiks** - **ZDF** - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9fd0ee963..7c8b29c3b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.20' +__version__ = '2015.02.21' From 4d1718481755dde078678b6e55d457fc6351fcdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 22:31:53 +0100 Subject: [PATCH 41/83] [YoutubeDL] don't set the 
'requested_subtitles' without writesubtitles or writeautomaticsub --- test/test_subtitles.py | 10 +++++----- youtube_dl/YoutubeDL.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c04fe6f22..457f268fa 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -113,7 +113,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_youtube_multiple_langs(self): self.url = 'QRS8MkLhQmM' @@ -152,7 +152,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_multiple_langs(self): self.DL.params['writesubtitles'] = True @@ -246,7 +246,7 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_multiple_langs(self): self.DL.params['writesubtitles'] = True @@ -281,7 +281,7 @@ class TestWallaSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) class TestCeskaTelevizeSubtitles(BaseTestSubtitles): @@ -308,7 +308,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) class TestLyndaSubtitles(BaseTestSubtitles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f8b8fb0c1..088b111eb 100755 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1164,8 +1164,10 @@ class YoutubeDL(object): if lang not in available_subs: available_subs[lang] = cap_info - if not available_subs: - return available_subs + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None if self.params.get('allsubtitles', False): requested_langs = available_subs.keys() From 03091e372f7033fa52c7961b1a99cd3790c0f60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 22:33:11 +0100 Subject: [PATCH 42/83] [ted] Always extract the subtitles The required info is already in the webpage --- youtube_dl/extractor/ted.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1809eaae4..0c38c8f89 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -163,8 +163,6 @@ class TEDIE(InfoExtractor): self._sort_formats(formats) video_id = compat_str(talk_info['id']) - # subtitles - video_subtitles = self.extract_subtitles(video_id, talk_info) thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -175,7 +173,7 @@ class TEDIE(InfoExtractor): 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'subtitles': video_subtitles, + 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, 'duration': talk_info.get('duration'), } @@ -194,7 +192,6 @@ class TEDIE(InfoExtractor): ] return sub_lang_list else: - self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): From ab84349b16b3c94775543a04855fc77005f8237e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:26:27 +0100 Subject: [PATCH 43/83] [test/YoutubeDL] Add test for subtitles Updated the offlinetest make target to not 
skip it --- Makefile | 2 +- test/test_YoutubeDL.py | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0636fc4cb..07c90c225 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b1cd6a69f..e11292211 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -337,6 +337,58 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + def test_subtitles(self): + def s_formats(lang, autocaption=False): + return [{ + 'ext': ext, + 'url': 'http://localhost/video.%s.%s' % (lang, ext), + '_auto': autocaption, + } for ext in ['vtt', 'srt', 'ass']] + subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) + auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) + info_dict = { + 'id': 'test', + 'title': 'Test', + 'url': 'http://localhost/video.mp4', + 'subtitles': subtitles, + 'automatic_captions': auto_captions, + 'extractor': 'TEST', + } + + def get_info(params={}): + params.setdefault('simulate', True) + ydl = YDL(params) + ydl.report_warning = lambda *args, **kargs: None + return ydl.process_video_result(info_dict, download=False) + + result = get_info() + self.assertFalse(result.get('requested_subtitles')) + self.assertEqual(result['subtitles'], subtitles) + self.assertEqual(result['automatic_captions'], auto_captions) + + result = get_info({'writesubtitles': True}) + subs = 
result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + self.assertTrue(subs['en'].get('data') is None) + self.assertEqual(subs['en']['ext'], 'ass') + + result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) + subs = result['requested_subtitles'] + self.assertEqual(subs['en']['ext'], 'srt') + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertFalse(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', From 98c70d6fc7006c8cbbd76fb1b8661d758fc4f5d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:37:27 +0100 Subject: [PATCH 44/83] [YoutubeDL] only add normal subtitles to the 'requested_subtitles' field if 'writesubtitles' is True --- test/test_YoutubeDL.py | 7 +++++++ youtube_dl/YoutubeDL.py | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index e11292211..055e42555 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -389,6 +389,13 @@ class TestFormatSelection(unittest.TestCase): self.assertFalse(subs['es']['_auto']) self.assertTrue(subs['pt']['_auto']) + result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertTrue(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', 
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 088b111eb..7319323e5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1156,11 +1156,13 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs, available_autocaps): + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" - if available_autocaps and self.params.get('writeautomaticsub'): - available_subs = available_subs.copy() - for lang, cap_info in available_autocaps.items(): + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info From b7bb76df05f53d4fc0570d07be5abcee238745e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:49:27 +0100 Subject: [PATCH 45/83] [test/subtitles] Remove some tests Test only with 'allsubtitles', the language selection is already tested in test_YoutubeDL.py --- test/test_subtitles.py | 121 +++++------------------------------------ 1 file changed, 13 insertions(+), 108 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 457f268fa..aa4e2bec4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -57,22 +57,15 @@ class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - - def test_youtube_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - 
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + for lang in ['it', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True @@ -86,12 +79,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - def test_youtube_list_subtitles(self): - self.DL.expect_warning('Video doesn\'t have automatic captions') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True @@ -115,36 +102,20 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_youtube_multiple_langs(self): - self.url = 'QRS8MkLhQmM' - self.DL.params['writesubtitles'] = True - langs = ['it', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - 
self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + for lang in ['es', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -154,51 +125,19 @@ class TestDailymotionSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertFalse(subtitles) - - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 28) - - def test_list_subtitles(self): 
- self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: + self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') + self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') + for lang in ['es', 'fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) @@ -206,11 +145,6 @@ class TestBlipTVSubtitles(BaseTestSubtitles): url = 'http://blip.tv/a/a-6603250' IE = BlipTVIE - def test_list_subtitles(self): - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True @@ -223,22 +157,13 @@ class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) + self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') + self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -248,25 +173,11 @@ class 
TestVimeoSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True @@ -288,12 +199,6 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True From 80970e531bd377e1952ac358e7e345cfbf23593d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:52:22 +0100 Subject: [PATCH 46/83] [test/subtitles] Update checksum for Viki --- test/test_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index aa4e2bec4..7f93f0a75 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -288,7 +288,7 @@ class TestVikiSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = 
self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') + self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') class TestThePlatformSubtitles(BaseTestSubtitles): From f311cfa23153fee51f94f14d1ab1f7f8b6a74702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 19:53:32 +0600 Subject: [PATCH 47/83] [appletrailers] Extend _VALID_URL (Closes #5027) --- youtube_dl/extractor/appletrailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 43e82847f..9c718ea66 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,7 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P[^/]+)/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' _TEST = { "url": "http://trailers.apple.com/trailers/wb/manofsteel/", 'info_dict': { From 35b798230334e984977090ae03a307eaf7eedbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 19:58:39 +0600 Subject: [PATCH 48/83] [appletrailers] Add test (#5027) --- youtube_dl/extractor/appletrailers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 9c718ea66..576f03b5b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -12,7 +12,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' - _TEST = { + _TESTS = [{ "url": "http://trailers.apple.com/trailers/wb/manofsteel/", 'info_dict': { 'id': 'manofsteel', @@ -63,7 +63,10 @@ class 
AppleTrailersIE(InfoExtractor): }, }, ] - } + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }] _JSON_RE = r'iTunes.playURL\((.*?)\);' From c010af6f195c2e84aec7d0ddec060fcbe9c45089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 23:11:33 +0600 Subject: [PATCH 49/83] [escapist] Make regexes more robust (Closes #5028) --- youtube_dl/extractor/escapist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 6b693b3b6..b49b9869f 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -31,10 +31,10 @@ class EscapistIE(InfoExtractor): webpage = self._download_webpage(url, video_id) uploader_id = self._html_search_regex( - r"

\s*(.*?)", + r"(.*?)", webpage, 'uploader', fatal=False) description = self._html_search_meta('description', webpage) @@ -42,7 +42,7 @@ class EscapistIE(InfoExtractor): title = raw_title.partition(' : ')[2] config_url = compat_urllib_parse.unquote(self._html_search_regex( - r' Date: Mon, 23 Feb 2015 03:30:10 +0600 Subject: [PATCH 50/83] [extractor/common] Fix preference for m3u8 quality selection URL --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d3f86cf4a..79f6d199b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -833,7 +833,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] From fcc3e6138b372e13578949dc724f456ae76dd065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 03:32:53 +0600 Subject: [PATCH 51/83] [r7] Add extractor (Closes #4405, closes #5004) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/r7.py | 88 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 youtube_dl/extractor/r7.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ef0adfd87..4d3e79de9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -364,6 +364,7 @@ from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .quickvid import QuickVidIE +from .r7 import R7IE from .radiode import RadioDeIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py new file mode 100644 index 000000000..976c8feec --- /dev/null +++ b/youtube_dl/extractor/r7.py @@ -0,0 +1,88 @@ +# 
coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + unescapeHTML, + int_or_none, +) + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://player.r7.com/video/i/%s' % video_id, video_id) + + item = self._parse_json(js_to_json(self._search_regex( + r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), 
video_id) + + title = unescapeHTML(item['title']) + thumbnail = item.get('init', {}).get('thumbUri') + duration = None + + statistics = item.get('statistics', {}) + like_count = int_or_none(statistics.get('likes')) + view_count = int_or_none(statistics.get('views')) + + formats = [] + for format_key, format_dict in item['playlist'][0].items(): + src = format_dict.get('src') + if not src: + continue + format_id = format_dict.get('format') or format_key + if duration is None: + duration = format_dict.get('duration') + if '.f4m' in src: + formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) + elif src.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) + else: + formats.append({ + 'url': src, + 'format_id': format_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } From ddc369f073fda4ddd429c2d9a104e561cefd417f Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Mon, 23 Feb 2015 12:00:43 +0100 Subject: [PATCH 52/83] [chirbit] fix profile downloader regex. --- youtube_dl/extractor/chirbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 47ce94aa0..443192f43 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -37,7 +37,7 @@ class ChirbitIE(InfoExtractor): } class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)/?$' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'playlist_count': 3, From 93b5071f73738d788c878b38a57f2b6efe0da883 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Mon, 23 Feb 2015 12:11:19 +0100 Subject: [PATCH 53/83] [soundgasm] add profile IE. 
--- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/soundgasm.py | 36 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94e150826..cf58f0800 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -425,7 +425,10 @@ from .soundcloud import ( SoundcloudUserIE, SoundcloudPlaylistIE ) -from .soundgasm import SoundgasmIE +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) from .southpark import ( SouthParkIE, SouthparkDeIE, diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index a4f8ce6c3..e568ff18c 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import clean_html class SoundgasmIE(InfoExtractor): @@ -38,3 +39,38 @@ class SoundgasmIE(InfoExtractor): 'title': audio_title, 'description': description } + +class SoundgasmProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 'playlist_count': 1, + 'info_dict': { + '_type': 'playlist', + 'id': 'ytdl', + 'title': 'ytdl' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + webpage = self._download_webpage(url, profile_id) + + ids = re.findall(r'''''' % re.escape(profile_id), webpage) + ids = [clean_html(id) for id in ids] + + entries = [] + for id in ids: + entries.append({ + '_type': 'url', + 'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id)) + }) + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; From b531cfc019576b682f930bd269f68eb87cfd5abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 23 Feb 2015 
16:12:35 +0100 Subject: [PATCH 54/83] [YoutubeDL] remove compatiblity with the old subtitles system --- youtube_dl/YoutubeDL.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7319323e5..70b364c9b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1189,14 +1189,6 @@ class YoutubeDL(object): if formats is None: self.report_warning('%s subtitles not available for %s' % (lang, video_id)) continue - if isinstance(formats, compat_str): - # TODO: convert all IE with subtitles support to the new format - # and remove this - subs[lang] = { - 'ext': formats_preference[0], - 'data': formats, - } - continue for ext in formats_preference: if ext == 'best': f = formats[-1] From a65d4e7f1458a681f250d6e2e0190644b50d6793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:15:16 +0600 Subject: [PATCH 55/83] [chirbit] Simplify and extract profile from RSS (#5032) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/chirbit.py | 113 ++++++++++++++----------------- 2 files changed, 53 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c3088fba2..40fc92cf7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,7 +63,10 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .chirbit import ChirbitIE, ChirbitProfileIE +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 443192f43..124307b7c 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,97 +1,82 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from 
.common import InfoExtractor -from ..utils import clean_html +from ..utils import ( + parse_duration, + int_or_none, +) class ChirbitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', 'md5': '9847b0dad6ac3e074568bf2cfb197de8', 'info_dict': { 'id': 'PrIPv5', - 'display_id': 'kukushtv_1423231243', 'ext': 'mp3', 'title': 'Фасадстрой', - 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + 'duration': 52, + 'view_count': int, + 'comment_count': int, } - } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }] def _real_extract(self, url): - audio_linkid = self._match_id(url) - webpage = self._download_webpage(url, audio_linkid) + audio_id = self._match_id(url) - audio_title = self._html_search_regex(r'(.*?)

', webpage, 'title') - audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') - audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) return { - 'id': audio_linkid, - 'display_id': audio_id, - 'title': audio_title, - 'url': audio_url + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, } + class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', - 'playlist_count': 3, 'info_dict': { - '_type': 'playlist', - 'title': 'ScarletBeauty', - 'id': 'ScarletBeauty' - } + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) - # Chirbit has a pretty weird "Last Page" navigation behavior. - # We grab the profile's oldest entry to determine when to - # stop fetching entries. 
- oldestpage = self._download_webpage(url + '/24599', profile_id) - oldest_page_entries = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - oldestpage); - oldestentry = clean_html(oldest_page_entries[-1]); + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) - ids = [] - titles = [] - n = 0 - while True: - page = self._download_webpage(url + '/' + str(n), profile_id) - page_ids = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - page); - page_titles = re.findall( - r'''(.*?)''', - page); - ids += page_ids - titles += page_titles - if oldestentry in page_ids: - break - n += 1 + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] - entries = [] - i = 0 - for id in ids: - entries.append({ - 'id': id, - 'title': titles[i], - 'url': 'http://audio.chirbit.com/' + id + '.mp3' - }); - i += 1 + title = rss.find('./channel/title').text - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id, title) From 3cc57f96455ce14cc5c72264a25b8d434174f7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:24 +0600 Subject: [PATCH 56/83] [soundgasm:profile] Simplify --- youtube_dl/extractor/soundgasm.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e568ff18c..e11d999f3 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -41,36 +41,22 @@ class SoundgasmIE(InfoExtractor): } class SoundgasmProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', - 'playlist_count': 1, 
'info_dict': { - '_type': 'playlist', 'id': 'ytdl', - 'title': 'ytdl' - } + }, + 'playlist_count': 1, } def _real_extract(self, url): profile_id = self._match_id(url) + webpage = self._download_webpage(url, profile_id) - ids = re.findall(r'''''' % re.escape(profile_id), webpage) - ids = [clean_html(id) for id in ids] + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] - entries = [] - for id in ids: - entries.append({ - '_type': 'url', - 'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id)) - }) - - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id) From 80af2b73ab0b51e4416500301948caa71ec39cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:56 +0600 Subject: [PATCH 57/83] [soundgasm] Clarify extractors' IE_NAMEs --- youtube_dl/extractor/soundgasm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e11d999f3..26e96a120 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,6 +8,7 @@ from ..utils import clean_html class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', @@ -41,6 +42,7 @@ class SoundgasmIE(InfoExtractor): } class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', From 04e8c1108023d9fe5c466d16f988a469e04f326e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:28:14 +0600 Subject: [PATCH 58/83] [chirbit] Clarify extractors' IE_NAMEs --- 
youtube_dl/extractor/chirbit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 124307b7c..b1eeaf101 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -9,6 +9,7 @@ from ..utils import ( class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', @@ -57,6 +58,7 @@ class ChirbitIE(InfoExtractor): class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', From 409693984f0acb8fbbf006c0d7965bc138211ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:30:30 +0600 Subject: [PATCH 59/83] [soundgasm:profile] Fix _VALID_URL --- youtube_dl/extractor/soundgasm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 26e96a120..9e992c9b7 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -43,7 +43,7 @@ class SoundgasmIE(InfoExtractor): class SoundgasmProfileIE(InfoExtractor): IE_NAME = 'soundgasm:profile' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', 'info_dict': { From 09c200acf258de115caeda210741a59f2b971b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:31:57 +0600 Subject: [PATCH 60/83] Credit @skypher for chirbit and soundgasm:profile (#5032) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 47f12a9ee..bdd2a15dc 100644 --- a/AUTHORS +++ b/AUTHORS @@ -111,3 +111,4 
@@ Paul Hartmann Frans de Jonge Robin de Rooij Ryan Schmidt +Leslie P. Polzer From 3438e7acd27d89d83d41e722d21d7660dbad7eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:40:50 +0600 Subject: [PATCH 61/83] [soundgasm] Remove unused import --- youtube_dl/extractor/soundgasm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 9e992c9b7..1c48478a6 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import clean_html class SoundgasmIE(InfoExtractor): From bd61a9e770506283f82e2ddf9e53b587169c2f04 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:47:19 +0100 Subject: [PATCH 62/83] release 2015.02.23 --- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5fe3e47cd..9f70db80a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -72,6 +72,8 @@ - **CeskaTelevize** - **channel9**: Channel 9 - **Chilloutzone** + - **chirbit** + - **chirbit:profile** - **Cinchcast** - **Cinemassacre** - **clipfish** @@ -330,6 +332,7 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Pyvideo** - **QuickVid** + - **R7** - **radio.de** - **radiobremen** - **radiofrance** @@ -385,7 +388,8 @@ - **soundcloud:playlist** - **soundcloud:set** - **soundcloud:user** - - **Soundgasm** + - **soundgasm** + - **soundgasm:profile** - **southpark.cc.com** - **southpark.de** - **Space** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7c8b29c3b..17317b29c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.21' +__version__ = '2015.02.23' 
From 5bca2424bc2dfb15b5394a51fa5befd7148edc41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:51:09 +0100 Subject: [PATCH 63/83] [gdcvault] Remove dead code --- youtube_dl/extractor/gdcvault.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 05f58f1af..e5011a5dc 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -66,7 +66,6 @@ class GDCVaultIE(InfoExtractor): def _parse_flv(self, xml_description): video_formats = [] - akami_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', From 591ab1dff913d7ff88f30487c54c1e9c5d44d0cb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:51:21 +0100 Subject: [PATCH 64/83] [soundgasm] PEP8 --- youtube_dl/extractor/soundgasm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 1c48478a6..3a4ddf57e 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -40,6 +40,7 @@ class SoundgasmIE(InfoExtractor): 'description': description } + class SoundgasmProfileIE(InfoExtractor): IE_NAME = 'soundgasm:profile' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' From 4432db35d9ddd0e6777df6c596d8637514ba0b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:59:11 +0600 Subject: [PATCH 65/83] [gdcvault] Restore akamai host for rtmp videos --- youtube_dl/extractor/gdcvault.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index e5011a5dc..f7b467b0a 100644 --- a/youtube_dl/extractor/gdcvault.py 
+++ b/youtube_dl/extractor/gdcvault.py @@ -66,9 +66,10 @@ class GDCVaultIE(InfoExtractor): def _parse_flv(self, xml_description): video_formats = [] + akamai_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(slide_video_path, '.flv'), 'ext': 'flv', 'format_note': 'slide deck video', @@ -78,7 +79,7 @@ class GDCVaultIE(InfoExtractor): }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text video_formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(speaker_video_path, '.flv'), 'ext': 'flv', 'format_note': 'speaker video', From 459e5fbd5fa61064076534c4d6d8a1d010acb1b3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 18:17:39 +0100 Subject: [PATCH 66/83] release 2015.02.23.1 --- README.md | 4 ++-- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8ea31d605..699401b49 100644 --- a/README.md +++ b/README.md @@ -351,8 +351,8 @@ which means you can modify it, redistribute it or use it however you like. 
--all-subs downloads all the available subtitles of the video --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] - youtube only) + --sub-format FORMAT subtitle format, accepts formats + preference, for example: "ass/srt/best" --sub-lang LANGS languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 17317b29c..1852d834b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.23' +__version__ = '2015.02.23.1' From ffdf972b9115d6d8c86439bf0828e945823bdcf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 23 Feb 2015 18:54:15 +0100 Subject: [PATCH 67/83] [facebook] Extract all the formats (closes #5037) --- youtube_dl/extractor/facebook.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 1ad4e77a8..f0e575320 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -126,11 +126,17 @@ class FacebookIE(InfoExtractor): params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) video_data = params['video_data'][0] - video_url = video_data.get('hd_src') - if not video_url: - video_url = video_data['sd_src'] - if not video_url: - raise ExtractorError('Cannot find video URL') + + formats = [] + for quality in ['sd', 'hd']: + src = video_data.get('%s_src' % quality) + if src is not None: + formats.append({ + 'format_id': quality, + 'url': src, + }) + if not formats: + raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', @@ -146,7 +152,7 @@ class 
FacebookIE(InfoExtractor): return { 'id': video_id, 'title': video_title, - 'url': video_url, + 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), } From 3037b91e05e68a4ab3420cbbdb23cfb0739011d3 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Mon, 23 Feb 2015 20:45:36 +0200 Subject: [PATCH 68/83] [laola1tv] Improve extraction and update test case (#3742) --- youtube_dl/extractor/laola1tv.py | 47 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 2fd3b4699..135421406 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,23 +1,26 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import random import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + xpath_text, +) class Laola1TvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html' _TEST = { - 'url': 'http://www.laola1.tv/de-de/live/bwf-bitburger-open-grand-prix-gold-court-1/250019.html', + 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { - 'id': '250019', + 'id': '227883', 'ext': 'mp4', - 'title': 'Bitburger Open Grand Prix Gold - Court 1', - 'categories': ['Badminton'], - 'uploader': 'BWF - Badminton World Federation', - 'is_live': True, + 'title': 'Straubing Tigers - Kölner Haie', + 'categories': ['Eishockey'], + 'is_live': False, }, 'params': { 'skip_download': True, @@ -43,15 +46,26 @@ class Laola1TvIE(InfoExtractor): r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe) flashvars = dict((m[0], m[1]) for m in flashvars_m) + partner_id = self._search_regex( + 'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') + xml_url = 
('http://www.laola1.tv/server/hd_video.php?' + - 'play=%s&partner=1&portal=%s&v5ident=&lang=%s' % ( - video_id, portal, lang)) + 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % ( + video_id, partner_id, portal, lang)) hd_doc = self._download_xml(xml_url, video_id) - title = hd_doc.find('.//video/title').text - flash_url = hd_doc.find('.//video/url').text - categories = hd_doc.find('.//video/meta_sports').text.split(',') - uploader = hd_doc.find('.//video/meta_organistation').text + title = xpath_text(hd_doc, './/video/title', fatal=True) + flash_url = xpath_text(hd_doc, './/video/url', fatal=True) + uploader = xpath_text(hd_doc, './/video/meta_organistation') + + is_live = xpath_text(hd_doc, './/video/islive') == 'true' + if is_live: + raise ExtractorError( + 'Live streams are not supported by the f4m downloader.') + + categories = xpath_text(hd_doc, './/video/meta_sports') + if categories: + categories = categories.split(',') ident = random.randint(10000000, 99999999) token_url = '%s&ident=%s&klub=0&unikey=0&timestamp=%s&auth=%s' % ( @@ -60,15 +74,16 @@ class Laola1TvIE(InfoExtractor): token_doc = self._download_xml( token_url, video_id, note='Downloading token') token_attrib = token_doc.find('.//token').attrib - if token_attrib.get('auth') == 'blocked': - raise ExtractorError('Token error: ' % token_attrib.get('comment')) + if token_attrib.get('auth') in ('blocked', 'restricted'): + raise ExtractorError( + 'Token error: %s' % token_attrib.get('comment'), expected=True) video_url = '%s?hdnea=%s&hdcore=3.2.0' % ( token_attrib['url'], token_attrib['auth']) return { 'id': video_id, - 'is_live': True, + 'is_live': is_live, 'title': title, 'url': video_url, 'uploader': uploader, From 1fbaa0a5210976a2a8fc0c20207708c35621416a Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Mon, 23 Feb 2015 20:51:30 +0200 Subject: [PATCH 69/83] [laola1tv] Use raw strings for regular expressions Oops --- youtube_dl/extractor/laola1tv.py | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 135421406..e8ca49fd1 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -47,7 +47,7 @@ class Laola1TvIE(InfoExtractor): flashvars = dict((m[0], m[1]) for m in flashvars_m) partner_id = self._search_regex( - 'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') + r'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') xml_url = ('http://www.laola1.tv/server/hd_video.php?' + 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % ( From 99209c2916753799e9c68e8d466c5253113f25bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Feb 2015 01:35:15 +0600 Subject: [PATCH 70/83] [youtube] Extract UL playlists as mixes (Closes #5040) --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 22db896b1..3690f8021 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1153,13 +1153,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' @@ -1244,7 +1244,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for vid_id in ids] def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( @@ -1280,7 +1280,7 @@ 
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - if playlist_id.startswith('RD'): + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) From 25ac63ed71bdc2a82842a593db9a150a0b8b7a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 23 Feb 2015 21:52:07 +0100 Subject: [PATCH 71/83] [rtve] Extract subtitles --- test/test_subtitles.py | 15 +++++++++++++++ youtube_dl/extractor/rtve.py | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7f93f0a75..3f2d8a2ba 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -25,6 +25,7 @@ from youtube_dl.extractor import ( RaiIE, VikiIE, ThePlatformIE, + RTVEALaCartaIE, ) @@ -305,5 +306,19 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +class TestRtveSubtitles(BaseTestSubtitles): + url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' + IE = RTVEALaCartaIE + + def test_allsubtitles(self): + print('Skipping, only available from Spain') + return + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['es'])) + self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index e60f85b5b..27cd34b7d 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -102,14 +102,26 @@ class RTVEALaCartaIE(InfoExtractor): video_url = 
compat_urlparse.urljoin( 'http://mvod1.akcdn.rtve.es/', video_path) + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) + return { 'id': video_id, 'title': info['title'], 'url': video_url, 'thumbnail': info.get('image'), 'page_url': url, + 'subtitles': subtitles, } + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict((s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' From ec5913b5cd92dfb8607ec535e02b04bdc09ff804 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:08:00 +0100 Subject: [PATCH 72/83] [bloomberg] Modernize --- youtube_dl/extractor/bloomberg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index c51a97ce4..4a88ccd13 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<id>.+?)\.html' _TEST = { 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', @@ -20,9 +20,9 @@ class BloombergIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + name = self._match_id(url) webpage = self._download_webpage(url, name) + f4m_url = self._search_regex( r'<source src="(https?://[^"]+\.f4m.*?)"', webpage, 'f4m url') From b665ba6aa6551243aa1a5b707ee7034be356f1bb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:24:26 +0100 Subject: [PATCH 73/83] release 2015.02.24 
--- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1852d834b..589f38834 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.23.1' +__version__ = '2015.02.24' From 9c665ab72e5fc99989800109cdada5acc3af56c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:37:27 +0100 Subject: [PATCH 74/83] [rtve] PEP8 --- youtube_dl/extractor/rtve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 27cd34b7d..c0fd23ff1 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -119,7 +119,8 @@ class RTVEALaCartaIE(InfoExtractor): subs = self._download_json( sub_file + '.json', video_id, 'Downloading subtitles info')['page']['items'] - return dict((s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) for s in subs) From 5a42414b9c4718f83f28fbc0e5a4a01ab67f23f6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:38:01 +0100 Subject: [PATCH 75/83] [utils] Prevent hyphen at beginning of filename (Fixes #5035) --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index c7373af1e..2f8996d7b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -85,6 +85,8 @@ class TestUtil(unittest.TestCase): self.assertEqual( sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') + self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') forbidden = '"\0\\/' for fc in forbidden: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 475fad3c9..e2631dccd 100644 --- 
a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -304,6 +304,8 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] if not result: result = '_' return result From db8e13ef714544574691f3dd4255dd0f12c1cf77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:38:21 +0100 Subject: [PATCH 76/83] release 2015.02.24.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 589f38834..a420860ed 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.24' +__version__ = '2015.02.24.1' From 54233c9080c1956f53802988fd8d5328cb38b7d7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 16:33:07 +0100 Subject: [PATCH 77/83] [escapist] Support JavaScript player (Fixes #5034) --- youtube_dl/extractor/escapist.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index b49b9869f..51ffec7ee 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -42,7 +42,14 @@ class EscapistIE(InfoExtractor): title = raw_title.partition(' : ')[2] config_url = compat_urllib_parse.unquote(self._html_search_regex( - r'<param\s+name="flashvars"\s+value="config=([^"&]+)', webpage, 'config URL')) + r'''(?x) + (?: + <param\s+name="flashvars"\s+value="config=| + flashvars="config= + ) + ([^"&]+) + ''', + webpage, 'config URL')) formats = [] From 4f3b21e1c738a7dacd514eb59242da43e81b5ae1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 16:34:42 +0100 Subject: [PATCH 78/83] release 2015.02.24.2 --- 
youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a420860ed..d23c6ae3d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.24.1' +__version__ = '2015.02.24.2' From df4bd0d53ff4baff6ce25ad04a1e87f37777c3ff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 17:25:02 +0100 Subject: [PATCH 79/83] [options] Add --yes-playlist as inverse of --no-playlist (Fixes #5051) --- youtube_dl/options.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5c2d153b1..886ce9613 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -272,6 +272,10 @@ def parseOpts(overrideArguments=None): '--no-playlist', action='store_true', dest='noplaylist', default=False, help='If the URL refers to a video and a playlist, download only the video.') + selection.add_option( + '--yes-playlist', + action='store_false', dest='noplaylist', default=False, + help='If the URL refers to a video and a playlist, download the playlist.') selection.add_option( '--age-limit', metavar='YEARS', dest='age_limit', default=None, type=int, From 41b264e77cd357444b632a132ea11ff7ddc3de1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Feb 2015 23:06:44 +0600 Subject: [PATCH 80/83] [nrktv] Workaround subtitles conversion issues on python 2.6 (Closes #5036) --- youtube_dl/extractor/nrk.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 46f493cfc..1e4cfa2e7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, 
float_or_none, @@ -158,7 +159,9 @@ class NRKTVIE(InfoExtractor): def _get_subtitles(self, subtitlesurl, video_id, baseurl): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) - captions = self._download_xml(url, video_id, 'Downloading subtitles') + captions = self._download_xml( + url, video_id, 'Downloading subtitles', + transform_source=lambda s: s.replace(r'<br />', '\r\n')) lang = captions.get('lang', 'no') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}')) srt = '' @@ -167,8 +170,7 @@ class NRKTVIE(InfoExtractor): duration = parse_duration(p.get('dur')) starttime = self._seconds2str(begin) endtime = self._seconds2str(begin + duration) - text = '\n'.join(p.itertext()) - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text) return {lang: [ {'ext': 'ttml', 'url': url}, {'ext': 'srt', 'data': srt}, From 59c7cbd482ba82248cd1bdca3569da6035720f21 Mon Sep 17 00:00:00 2001 From: logon84 <rubenlogon@yahoo.es> Date: Tue, 24 Feb 2015 18:58:32 +0100 Subject: [PATCH 81/83] Update eporner.py Updated to work. Old version shows an error about being unable to extract "redirect_code" --- youtube_dl/extractor/eporner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 4de8d4bc5..9ae28855b 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,10 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) 
- EPORNER', webpage, 'title') - redirect_code = self._html_search_regex( - r'<script type="text/javascript" src="/config5/%s/([a-f\d]+)/">' % video_id, - webpage, 'redirect_code') - redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, redirect_code) + redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') From e765ed3a9c4ba52d4709a4a696881eae3401efa0 Mon Sep 17 00:00:00 2001 From: logon84 <rubenlogon@yahoo.es> Date: Tue, 24 Feb 2015 19:41:46 +0100 Subject: [PATCH 82/83] [eporner] Fix redirect_code error --- youtube_dl/extractor/eporner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 9ae28855b..f5943caa5 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,6 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) - EPORNER', webpage, 'title') + redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') From 37f885650c323e040a200bda9376bc7dbdf2ca25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Feb 2015 01:08:54 +0600 Subject: [PATCH 83/83] [eporner] Simplify and hardcode age limit --- youtube_dl/extractor/eporner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index f5943caa5..e006921ec 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,8 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) 
- EPORNER', webpage, 'title') - - redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) + redirect_url = 'http://www.eporner.com/config5/%s' % video_id player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') @@ -67,5 +66,5 @@ class EpornerIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'formats': formats, - 'age_limit': self._rta_search(webpage), + 'age_limit': 18, }