From a504ced097e703a9bc6c18b6e31bcafb4783ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:03:41 +0100 Subject: [PATCH 001/131] Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents need to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available; checking it in each extractor would be repetitive. * It makes it easy to support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 
--- test/test_subtitles.py | 24 ++++----- youtube_dl/YoutubeDL.py | 85 +++++++++++++++++++++++++++--- youtube_dl/__init__.py | 1 - youtube_dl/extractor/common.py | 20 ++++++- youtube_dl/extractor/ted.py | 18 ++++--- youtube_dl/options.py | 4 +- youtube_dl/postprocessor/ffmpeg.py | 6 +-- 7 files changed, 121 insertions(+), 37 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..fbc9eaf4d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() - self.ie = self.IE(self.DL) + self.ie = self.IE() + self.DL.add_info_extractor(self.ie) def getInfoDict(self): - info_dict = self.ie.extract(self.url) + info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict['subtitles'] + subtitles = info_dict['subtitles'] + if not subtitles: + return subtitles + for sub_info in subtitles.values(): + if sub_info.get('data') is None: + uf = self.DL.urlopen(sub_info['url']) + sub_info['data'] = uf.read().decode('utf-8') + return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): @@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles): def test_no_writesubtitles(self): subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) + self.assertFalse(subtitles) def test_subtitles(self): self.DL.params['writesubtitles'] = True @@ -196,18 +204,10 @@ class TestTedSubtitles(BaseTestSubtitles): self.assertTrue(len(subtitles.keys()) >= 28) def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - 
self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_multiple_langs(self): self.DL.params['writesubtitles'] = True langs = ['es', 'fr', 'de'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..e665e3d53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@ -1019,6 +1019,11 @@ class YoutubeDL(object): info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + return + info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -1147,6 +1152,53 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, available_subs): + """Select the requested subtitles and their format""" + if not available_subs: + return available_subs + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = 
self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + if isinstance(formats, compat_str): + # TODO: convert all IE with subtitles support to the new format + # and remove this + subs[lang] = { + 'ext': formats_preference[0], + 'data': formats, + } + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1253,11 +1305,18 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,7 +1324,7 @@ class YoutubeDL(object): else: 
self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@ -1586,6 +1645,18 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles): + if not subtitles: + self.to_screen('%s has no subtitles' % video_id) + return + header_line = 'Language formats' + sub_lines = [ + '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) + for lang, formats in subtitles.items()] + self.to_screen( + 'Available subtitles for %s:\n%s\n%s' % + (video_id, header_line, '\n'.join(sub_lines))) + def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ed22f169f..5f2585003 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -226,7 +226,6 @@ def _real_main(argv=None): if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c784eedb9..161c623eb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -151,8 +151,14 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. 
"subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A url pointing to the subtitles file + Note: YoutubeDL.extract_info will get the requested + format and replace the "subformats" list with it. duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video @@ -993,6 +999,16 @@ class InfoExtractor(object): any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + subtitles = {} + list_subtitles = self._downloader.params.get('listsubtitles') + if self._downloader.params.get('writesubtitles', False) or list_subtitles: + subtitles.update(self._get_subtitles(*args, **kwargs)) + return subtitles + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10b3b706a..1809eaae4 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import json import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor): _VALID_URL = r'''(?x) (?Phttps?://) (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ @@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor): video_id = compat_str(talk_info['id']) # subtitles video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -183,13 +180,18 @@ class 
TEDIE(SubtitlesInfoExtractor): 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] return sub_lang_list else: self._downloader.report_warning('video doesn\'t have subtitles') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 873432bee..4fcf8c83d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None): help='lists all available subtitles for the video') subtitles.add_option( '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', - help='subtitle format (default=srt) ([sbv/vtt] youtube only)') + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', + help='subtitle format, accepts formats preference, for example: "ass/srt/best"') subtitles.add_option( '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5238ce534..d1bbfbfe3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): 'zu': 'zul', } - def __init__(self, downloader=None, subtitlesformat='srt'): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) - self._subformat = subtitlesformat - @classmethod def _conver_lang_code(cls, code): """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -472,7 +468,7 @@ class 
FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_langs = [key for key in information['subtitles']] filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()] opts = [ '-map', '0', From b5857f62e2c2ca70316e041212aa9e89d54cc253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:21:42 +0100 Subject: [PATCH 002/131] [crunchyroll] Convert to new subtitles system --- youtube_dl/extractor/crunchyroll.py | 66 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 1680f532f..f1da7d09b 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -25,10 +25,9 @@ from ..aes import ( aes_cbc_decrypt, inc, ) -from .common import InfoExtractor -class CrunchyrollIE(SubtitlesInfoExtractor): +class CrunchyrollIE(InfoExtractor): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -187,6 +186,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _get_subtitles(self, video_id, webpage): + subtitles = {} + for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): + sub_page = self._download_webpage( + 
'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, + video_id, note='Downloading subtitles for ' + sub_name) + id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) + iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) + data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) + if not id or not iv or not data: + continue + id = int(id) + iv = base64.b64decode(iv) + data = base64.b64decode(data) + + subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') + lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) + if not lang_code: + continue + sub_root = xml.etree.ElementTree.fromstring(subtitle) + subtitles[lang_code] = [ + { + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, + { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }, + ] + return subtitles + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') @@ -249,34 +280,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'format_id': video_format, }) - subtitles = {} - sub_format = self._downloader.params.get('subtitlesformat', 'srt') - for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): - sub_page = self._download_webpage( - 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id, - video_id, note='Downloading subtitles for ' + sub_name) - id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False) - iv = self._search_regex(r'([^<]+)', sub_page, 'subtitle_iv', fatal=False) - data = self._search_regex(r'([^<]+)', sub_page, 'subtitle_data', fatal=False) - if not id or not iv or not data: - continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') - 
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - if sub_format == 'ass': - subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root) - else: - subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, From 6b597516c12c7fd81e832f3ec05dd0dca6089823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:32:40 +0100 Subject: [PATCH 003/131] [atresplayer] Convert to new subtitles system --- youtube_dl/extractor/atresplayer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index f016368fa..7669e0e3d 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import time import hmac -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, @@ -17,7 +17,7 @@ from ..utils import ( ) -class AtresPlayerIE(SubtitlesInfoExtractor): +class AtresPlayerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P.+?)_\d+\.html' _TESTS = [ { @@ -144,13 +144,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor): thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') subtitles = {} - subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') - if subtitle: - subtitles['es'] = subtitle - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitle_url = 
xpath_text(episode, './media/asset/files/subtitle', 'subtitle') + if subtitle_url: + subtitles['es'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] return { 'id': video_id, @@ -159,5 +158,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } From 65469a7f8b0ba50bd3c8918707e35125962aa2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 15 Feb 2015 18:52:07 +0100 Subject: [PATCH 004/131] [vimeo] Convert to new subtitles system Removed some tests, the behaviour should be checked in a test for the YoutubeDL class --- test/parameters.json | 2 +- test/test_subtitles.py | 17 ----------------- youtube_dl/extractor/vimeo.py | 15 ++++++--------- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index af77b89b4..cbff9bd16 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -28,7 +28,7 @@ "retries": 10, "simulate": false, "subtitleslang": null, - "subtitlesformat": "srt", + "subtitlesformat": "best", "test": true, "updatetime": true, "usenetrc": false, diff --git a/test/test_subtitles.py b/test/test_subtitles.py index fbc9eaf4d..3f2d61d36 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -240,10 +240,6 @@ class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() @@ -261,19 +257,6 @@ class TestVimeoSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - 
self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://vimeo.com/56015672' diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 303e81447..5930d5984 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -6,7 +6,6 @@ import re import itertools from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -51,7 +50,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._download_webpage(login_request, None, False, 'Wrong login info') -class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): +class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs @@ -368,12 +367,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): text_tracks = config['request'].get('text_tracks') if text_tracks: for tt in text_tracks: - subtitles[tt['lang']] = 'http://vimeo.com' + tt['url'] - - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'http://vimeo.com' + tt['url'], + }] return { 'id': video_id, @@ -389,7 +386,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } From 
c84dd8a90dcc75547b343449b921b644a2119c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:12:31 +0100 Subject: [PATCH 005/131] [YoutubeDL] store the subtitles to download in the 'requested_subtitles' field We need to keep the original subtitles information, so that the '--load-info' option can be used to list or select the subtitles again. We'll also be able to have a separate field for storing the automatic captions info. --- test/test_subtitles.py | 2 +- youtube_dl/YoutubeDL.py | 6 +++--- youtube_dl/extractor/common.py | 2 -- youtube_dl/postprocessor/ffmpeg.py | 7 ++++--- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 3f2d61d36..b3c615c4f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -36,7 +36,7 @@ class BaseTestSubtitles(unittest.TestCase): def getSubtitles(self): info_dict = self.getInfoDict() - subtitles = info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] if not subtitles: return subtitles for sub_info in subtitles.values(): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e665e3d53..8545dc9e9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1022,7 +1022,7 @@ class YoutubeDL(object): if self.params.get('listsubtitles', False): self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) return - info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1301,10 +1301,10 @@ class YoutubeDL(object): subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if 
subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] + subtitles = info_dict['requested_subtitles'] for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] if sub_info.get('data') is not None: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 161c623eb..d149e0f92 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -157,8 +157,6 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A url pointing to the subtitles file - Note: YoutubeDL.extract_info will get the requested - format and replace the "subformats" list with it. duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index d1bbfbfe3..e42298f0e 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -462,13 +462,14 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): if information['ext'] != 'mp4': self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - if not information.get('subtitles'): + subtitles = information.get('requested_subtitles') + if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return True, information - sub_langs = [key for key in information['subtitles']] + sub_langs = list(subtitles.keys()) filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()] + input_files = [filename] + [subtitles_filename(filename, lang, 
sub_info['ext']) for lang, sub_info in subtitles.items()] opts = [ '-map', '0', From a1f2a06b34807a2e1b5eb5176fa418da2405392d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:28:06 +0100 Subject: [PATCH 006/131] [dailymotion] Convert to new subtitles system --- test/test_subtitles.py | 17 ----------------- youtube_dl/extractor/dailymotion.py | 10 +++------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index b3c615c4f..84ae0e714 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -127,10 +127,6 @@ class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() @@ -148,19 +144,6 @@ class TestDailymotionSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index cf5841a7c..4ca892926 100644 --- a/youtube_dl/extractor/dailymotion.py +++ 
b/youtube_dl/extractor/dailymotion.py @@ -6,7 +6,6 @@ import json import itertools from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@ -31,7 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): return request -class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): +class DailymotionIE(DailymotionBaseInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P[^/?_]+)' @@ -143,9 +142,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, webpage) - return view_count = str_to_int(self._search_regex( r'video_views_count[^>]+>\s+([\d\.,]+)', @@ -169,7 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'view_count': view_count, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@ -179,7 +175,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return {} info = json.loads(sub_list) if (info['total'] > 0): - sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) return sub_lang_list self._downloader.report_warning('video doesn\'t have subtitles') return {} From 360e1ca5ccabcb5d48228d9472b09f1bce68bbc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 16 Feb 2015 21:44:17 +0100 Subject: [PATCH 007/131] [youtube] Convert to new subtitles system The automatic captions are stored in the 
'automatic_captions' field, which is used if no normal subtitles are found for a specific language. --- test/test_subtitles.py | 5 --- youtube_dl/YoutubeDL.py | 24 ++++++++++---- youtube_dl/extractor/common.py | 12 +++++++ youtube_dl/extractor/youtube.py | 57 ++++++++++++++++++--------------- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 84ae0e714..91cebce28 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_no_writesubtitles(self): - self.DL.params['writesubtitles'] = False - subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) - def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8545dc9e9..a47f8f5de 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1020,9 +1020,13 @@ class YoutubeDL(object): info_dict['upload_date'] = upload_date.strftime('%Y%m%d') if self.params.get('listsubtitles', False): - self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') return - info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], info_dict.get('subtitles'), + info_dict.get('automatic_captions')) # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: @@ -1152,8 +1156,14 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs): + def 
process_subtitles(self, video_id, available_subs, available_autocaps): """Select the requested subtitles and their format""" + if available_autocaps and self.params.get('writeautomaticsub'): + available_subs = available_subs.copy() + for lang, cap_info in available_autocaps.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + if not available_subs: return available_subs @@ -1645,17 +1655,17 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - def list_subtitles(self, video_id, subtitles): + def list_subtitles(self, video_id, subtitles, name='subtitles'): if not subtitles: - self.to_screen('%s has no subtitles' % video_id) + self.to_screen('%s has no %s' % (video_id, name)) return header_line = 'Language formats' sub_lines = [ '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) for lang, formats in subtitles.items()] self.to_screen( - 'Available subtitles for %s:\n%s\n%s' % - (video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:\n%s\n%s' % + (name, video_id, header_line, '\n'.join(sub_lines))) def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d149e0f92..fe7d8dbc9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -157,6 +157,8 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A url pointing to the subtitles file + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. 
like_count: Number of positive ratings of the video @@ -1007,6 +1009,16 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + def extract_automatic_captions(self, *args, **kwargs): + automatic_captions = {} + list_subtitles = self._downloader.params.get('listsubtitles') + if self._downloader.params.get('writeautomaticsub', False) or list_subtitles: + automatic_captions.update(self._get_automatic_captions(*args, **kwargs)) + return automatic_captions + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35ef4c303..1b2dbf276 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': 
video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions 
but there are subtitles @@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, From 48246541da66a12486505804f9519391a298ff54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:17:47 +0100 Subject: [PATCH 008/131] [ceskatelevize] Convert to new subtitles system --- youtube_dl/extractor/ceskatelevize.py | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index f70e090bb..65f6be623 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, @@ -15,7 +15,7 @@ from ..utils import ( ) -class CeskaTelevizeIE(SubtitlesInfoExtractor): +class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' _TESTS = [ @@ -107,13 +107,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): subtitles = {} subs = item.get('subtitles') if subs: - subtitles['cs'] = 
subs[0]['url'] - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + subtitles = self.extract_subtitles(episode_id, subs) return { 'id': episode_id, @@ -125,11 +119,20 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } + def _get_subtitles(self, episode_id, subs): + original_subtitles = self._download_webpage( + subs[0]['url'], episode_id, 'Downloading subtitles') + srt_subs = self._fix_subtitles(original_subtitles) + return { + 'cs': [{ + 'ext': 'srt', + 'data': srt_subs, + }] + } + @staticmethod def _fix_subtitles(subtitles): """ Convert millisecond-based subtitles to SRT """ - if subtitles is None: - return subtitles # subtitles not requested def _msectotimecode(msec): """ Helper utility to convert milliseconds to timecode """ @@ -149,7 +152,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor): else: yield line - fixed_subtitles = {} - for k, v in subtitles.items(): - fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) - return fixed_subtitles + return "\r\n".join(_fix_subtitle(subtitles)) From bd7fe0cf6668c9ea9272dbe25774072b383e67d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:23:09 +0100 Subject: [PATCH 009/131] [walla] Convert to new subtitles system --- youtube_dl/extractor/walla.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 672bda7a7..24efbd6e6 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ( xpath_text, int_or_none, ) -class WallaIE(SubtitlesInfoExtractor): +class WallaIE(InfoExtractor): _VALID_URL = 
r'http://vod\.walla\.co\.il/[^/]+/(?P\d+)/(?P.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', @@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor): subtitles = {} for subtitle in item.findall('./subtitles/subtitle'): lang = xpath_text(subtitle, './title') - subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{ + 'ext': 'srt', + 'url': xpath_text(subtitle, './src'), + }] formats = [] for quality in item.findall('./qualities/quality'): From 85920dd01d98cf74ea7d3ab7834a3b50cd6f1fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 21:56:25 +0100 Subject: [PATCH 010/131] [bliptv] Convert to new subtitles system --- test/test_subtitles.py | 2 -- youtube_dl/extractor/bliptv.py | 34 ++++++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 91cebce28..0ca510310 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -200,13 +200,11 @@ class TestBlipTVSubtitles(BaseTestSubtitles): IE = BlipTVIE def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_allsubtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 436cc5155..8c7ba4b91 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -3,7 +3,6 @@ 
from __future__ import unicode_literals import re from .common import InfoExtractor -from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@ -18,7 +17,7 @@ from ..utils import ( ) -class BlipTVIE(SubtitlesInfoExtractor): +class BlipTVIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P\d+)|((?:play/|api\.swf#)(?P[\da-zA-Z+_]+)))' _TESTS = [ @@ -143,7 +142,7 @@ class BlipTVIE(SubtitlesInfoExtractor): categories = [category.text for category in item.findall('category')] formats = [] - subtitles = {} + subtitles_urls = {} media_group = item.find(media('group')) for media_content in media_group.findall(media('content')): @@ -161,7 +160,7 @@ class BlipTVIE(SubtitlesInfoExtractor): } lang = role.rpartition('-')[-1].strip().lower() langcode = LANGS.get(lang, lang) - subtitles[langcode] = url + subtitles_urls[langcode] = url elif media_type.startswith('video/'): formats.append({ 'url': real_url, @@ -175,11 +174,7 @@ class BlipTVIE(SubtitlesInfoExtractor): }) self._sort_formats(formats) - # subtitles - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(video_id, subtitles_urls) return { 'id': video_id, @@ -192,15 +187,22 @@ class BlipTVIE(SubtitlesInfoExtractor): 'thumbnail': thumbnail, 'categories': categories, 'formats': formats, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } - def _download_subtitle_url(self, sub_lang, url): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - return self._download_webpage(req, None, note=False) + def _get_subtitles(self, video_id, subtitles_urls): + subtitles = {} + for lang, url in subtitles_urls.items(): + # For some weird reason, blip.tv serves a video 
instead of subtitles + # when we request with a common UA + req = compat_urllib_request.Request(url) + req.add_header('User-Agent', 'youtube-dl') + subtitles[lang] = [{ + # The extension is 'srt' but it's actually an 'ass' file + 'ext': 'ass', + 'data': self._download_webpage(req, None, note=False), + }] + return subtitles class BlipTVUserIE(InfoExtractor): From 9868ea493626a3a81d30d084fd00d22982a0f86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 22:16:29 +0100 Subject: [PATCH 011/131] [extractor/common] Simplify subtitles handling methods Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables. --- youtube_dl/extractor/common.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fe7d8dbc9..7d8ce1808 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1000,21 +1000,19 @@ class InfoExtractor(object): return not any_restricted def extract_subtitles(self, *args, **kwargs): - subtitles = {} - list_subtitles = self._downloader.params.get('listsubtitles') - if self._downloader.params.get('writesubtitles', False) or list_subtitles: - subtitles.update(self._get_subtitles(*args, **kwargs)) - return subtitles + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") def extract_automatic_captions(self, *args, **kwargs): - automatic_captions = {} - list_subtitles = self._downloader.params.get('listsubtitles') - if self._downloader.params.get('writeautomaticsub', False) or list_subtitles: - 
automatic_captions.update(self._get_automatic_captions(*args, **kwargs)) - return automatic_captions + if (self._downloader.params.get('writeautomaticsub', False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") From edab9dbf4d00a7f76fbfd2df9ef4b205c88e47a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 17 Feb 2015 22:59:19 +0100 Subject: [PATCH 012/131] [YoutubeDL] use the 'render_table' function for listing the subtitles --- youtube_dl/YoutubeDL.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a47f8f5de..f8b8fb0c1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1659,13 +1659,12 @@ class YoutubeDL(object): if not subtitles: self.to_screen('%s has no %s' % (video_id, name)) return - header_line = 'Language formats' - sub_lines = [ - '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) - for lang, formats in subtitles.items()] self.to_screen( - 'Available %s for %s:\n%s\n%s' % - (name, video_id, header_line, '\n'.join(sub_lines))) + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) def urlopen(self, req): """ Start an HTTP download """ From 37dd5d4629ae955940265f245316c43cd0373a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 16:54:36 +0100 Subject: [PATCH 013/131] [mit] Don't set the subtitles field YouTube already provides them in more formats --- youtube_dl/extractor/mit.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py 
index 3c61a850f..d7ab6a9ae 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -5,9 +5,6 @@ import json from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import ( - compat_urlparse, -) from ..utils import ( clean_html, ExtractorError, @@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor): 'upload_date': '20121109', 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', - # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' } }, { @@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor): 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', - # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' } } ] @@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7]) else: # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) @@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor): metadata = re.sub(r'[\'"]', '', embed_media.group(1)) metadata = re.split(r', ?', metadata) yt = metadata[1] - subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5]) else: raise ExtractorError('Unable to find embedded YouTube video.') video_id = YoutubeIE.extract_id(yt) @@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor): 'title': title, 'description': description, 'url': yt, - 'url_transparent' - 'subtitles': subs, 'ie_key': 
'Youtube', } From 18c1c424057dd06f85f4420b14089e032fcb0000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:20:22 +0100 Subject: [PATCH 014/131] [drtv] Convert to new subtitles system --- youtube_dl/extractor/drtv.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index d5df18d7c..8257e35a4 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,11 +1,10 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor -from .common import ExtractorError +from .common import InfoExtractor, ExtractorError from ..utils import parse_iso8601 -class DRTVIE(SubtitlesInfoExtractor): +class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' _TEST = { @@ -76,7 +75,7 @@ class DRTVIE(SubtitlesInfoExtractor): } for subs in subtitles_list: lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = subs['Uri'] + subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] if not formats and restricted_to_denmark: raise ExtractorError( @@ -84,10 +83,6 @@ class DRTVIE(SubtitlesInfoExtractor): self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - return { 'id': video_id, 'title': title, @@ -96,5 +91,5 @@ class DRTVIE(SubtitlesInfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, subtitles), + 'subtitles': subtitles, } From 311c39383827e42649a287633a67ef021476d23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:46:33 +0100 Subject: [PATCH 015/131] [lynda] Convert to new subtitles system --- test/test_subtitles.py | 13 ++++++++ youtube_dl/extractor/lynda.py | 60 
++++++++++++++--------------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 0ca510310..ee170879f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -18,6 +18,7 @@ from youtube_dl.extractor import ( VimeoIE, WallaIE, CeskaTelevizeIE, + LyndaIE, ) @@ -304,5 +305,17 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.assertEqual(len(subtitles), 0) +class TestLyndaSubtitles(BaseTestSubtitles): + url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' + IE = LyndaIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 762cefa34..109055e72 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re import json -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..compat import ( compat_str, @@ -16,7 +15,7 @@ from ..utils import ( ) -class LyndaIE(SubtitlesInfoExtractor): +class LyndaIE(InfoExtractor): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' @@ -88,11 +87,7 @@ class LyndaIE(SubtitlesInfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, page) - return - - subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) + subtitles = self.extract_subtitles(video_id, page) return { 'id': video_id, @@ -144,38 +139,31 @@ class LyndaIE(SubtitlesInfoExtractor): if 
re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: raise ExtractorError('Unable to log in') - def _fix_subtitles(self, subtitles): - if subtitles is None: - return subtitles # subtitles not requested - - fixed_subtitles = {} - for k, v in subtitles.items(): - subs = json.loads(v) - if len(subs) == 0: + def _fix_subtitles(self, subs): + srt = '' + for pos in range(0, len(subs) - 1): + seq_current = subs[pos] + m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) + if m_current is None: continue - srt = '' - for pos in range(0, len(subs) - 1): - seq_current = subs[pos] - m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) - if m_current is None: - continue - seq_next = subs[pos + 1] - m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) - if m_next is None: - continue - appear_time = m_current.group('timecode') - disappear_time = m_next.group('timecode') - text = seq_current['Caption'] - srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) - if srt: - fixed_subtitles[k] = srt - return fixed_subtitles + seq_next = subs[pos + 1] + m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) + if m_next is None: + continue + appear_time = m_current.group('timecode') + disappear_time = m_next.group('timecode') + text = seq_current['Caption'] + srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) + if srt: + return srt - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - sub = self._download_webpage(url, None, False) - sub_json = json.loads(sub) - return {'en': url} if len(sub_json) > 0 else {} + subs = self._download_json(url, None, False) + if subs: + return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + else: + return {} class LyndaCourseIE(InfoExtractor): From b9b42f2ea0c564f3e75a8f052bfe0dfe21cf320f Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 17:57:10 +0100 Subject: [PATCH 016/131] [npo] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/npo.py | 14 +++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index ee170879f..b2195cac4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -19,6 +19,7 @@ from youtube_dl.extractor import ( WallaIE, CeskaTelevizeIE, LyndaIE, + NPOIE, ) @@ -317,5 +318,17 @@ class TestLyndaSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') +class TestNPOSubtitles(BaseTestSubtitles): + url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' + IE = NPOIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['nl'])) + self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c075618e8..9c01eb0af 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, @@ -12,7 +11,7 @@ from ..utils import ( ) -class NPOBaseIE(SubtitlesInfoExtractor): +class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', @@ -164,13 +163,10 @@ class NPOIE(NPOBaseIE): subtitles = {} if metadata.get('tt888') == 'ja': - subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) 
- return - - subtitles = self.extract_subtitles(video_id, subtitles) + subtitles['nl'] = [{ + 'ext': 'vtt', + 'url': 'http://e.omroep.nl/tt888/%s' % video_id, + }] return { 'id': video_id, From 0af25f784bc5bff7cbce2d4af725b4bf2d2262c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 18:27:45 +0100 Subject: [PATCH 017/131] [mtv] Convert to new subtitles system --- test/test_subtitles.py | 16 ++++++++++++++++ youtube_dl/extractor/mtv.py | 26 +++++++------------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index b2195cac4..c018d9b49 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -20,6 +20,7 @@ from youtube_dl.extractor import ( CeskaTelevizeIE, LyndaIE, NPOIE, + ComedyCentralIE, ) @@ -330,5 +331,20 @@ class TestNPOSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') +class TestMTVSubtitles(BaseTestSubtitles): + url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' + IE = ComedyCentralIE + + def getInfoDict(self): + return super(TestMTVSubtitles, self).getInfoDict()['entries'][0] + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index bc7f49ebb..c11de1cb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, @@ -23,7 +23,7 @@ def 
_media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag -class MTVServicesInfoExtractor(SubtitlesInfoExtractor): +class MTVServicesInfoExtractor(InfoExtractor): _MOBILE_TEMPLATE = None @staticmethod @@ -95,25 +95,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): def _extract_subtitles(self, mdoc, mtvn_id): subtitles = {} - FORMATS = { - 'scc': 'cea-608', - 'eia-608': 'cea-608', - 'xml': 'ttml', - } - subtitles_format = FORMATS.get( - self._downloader.params.get('subtitlesformat'), 'ttml') for transcript in mdoc.findall('.//transcript'): if transcript.get('kind') != 'captions': continue lang = transcript.get('srclang') - for typographic in transcript.findall('./typographic'): - captions_format = typographic.get('format') - if captions_format == subtitles_format: - subtitles[lang] = compat_str(typographic.get('src')) - break - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(mtvn_id, subtitles) - return self.extract_subtitles(mtvn_id, subtitles) + subtitles[lang] = [{ + 'url': compat_str(typographic.get('src')), + 'ext': typographic.get('format') + } for typographic in transcript.findall('./typographic')] + return subtitles def _get_video_info(self, itemdoc): uri = itemdoc.find('guid').text @@ -196,8 +186,6 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor): webpage, 'mgid') videos_info = self._get_videos_info(mgid) - if self._downloader.params.get('listsubtitles', False): - return return videos_info From 01561da142485a581e67ce98ef009ebe0ed7b4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 18:57:01 +0100 Subject: [PATCH 018/131] [nrk] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/nrk.py | 15 +++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c018d9b49..1e2324232 100644 --- a/test/test_subtitles.py +++ 
b/test/test_subtitles.py @@ -21,6 +21,7 @@ from youtube_dl.extractor import ( LyndaIE, NPOIE, ComedyCentralIE, + NRKTVIE, ) @@ -346,5 +347,17 @@ class TestMTVSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'b9f6ca22a6acf597ec76f61749765e65') +class TestNRKSubtitles(BaseTestSubtitles): + url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1' + IE = NRKTVIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['no'])) + self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f6de26022..46f493cfc 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -10,7 +10,6 @@ from ..utils import ( parse_duration, unified_strdate, ) -from .subtitles import SubtitlesInfoExtractor class NRKIE(InfoExtractor): @@ -73,7 +72,7 @@ class NRKIE(InfoExtractor): } -class NRKTVIE(SubtitlesInfoExtractor): +class NRKTVIE(InfoExtractor): _VALID_URL = r'(?Phttp://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' 
_TESTS = [ @@ -156,7 +155,7 @@ class NRKTVIE(SubtitlesInfoExtractor): if self._downloader.params.get('verbose', False): self.to_screen('[debug] %s' % txt) - def _extract_captions(self, subtitlesurl, video_id, baseurl): + def _get_subtitles(self, subtitlesurl, video_id, baseurl): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) captions = self._download_xml(url, video_id, 'Downloading subtitles') @@ -170,7 +169,10 @@ class NRKTVIE(SubtitlesInfoExtractor): endtime = self._seconds2str(begin + duration) text = '\n'.join(p.itertext()) srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) - return {lang: srt} + return {lang: [ + {'ext': 'ttml', 'url': url}, + {'ext': 'srt', 'data': srt}, + ]} def _extract_f4m(self, manifest_url, video_id): return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) @@ -243,10 +245,7 @@ class NRKTVIE(SubtitlesInfoExtractor): webpage, 'subtitle URL', default=None) subtitles = None if subtitles_url: - subtitles = self._extract_captions(subtitles_url, video_id, baseurl) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl) return { 'id': video_id, From afbdd3acc36130d1a717b3cacab69c0dfc716622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 18 Feb 2015 20:14:42 +0100 Subject: [PATCH 019/131] [rai] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/rai.py | 22 +++++++++------------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 1e2324232..4dbb50515 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -22,6 +22,7 @@ from youtube_dl.extractor import ( NPOIE, ComedyCentralIE, NRKTVIE, + RaiIE, ) @@ -359,5 +360,17 @@ 
class TestNRKSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') +class TestRaiSubtitles(BaseTestSubtitles): + url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html' + IE = RaiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index aa26b7e0b..144e33982 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_urllib_parse, ) @@ -12,7 +12,7 @@ from ..utils import ( ) -class RaiIE(SubtitlesInfoExtractor): +class RaiIE(InfoExtractor): _VALID_URL = r'(?Phttp://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _TESTS = [ { @@ -89,15 +89,7 @@ class RaiIE(SubtitlesInfoExtractor): 'ext': 'mp4', }) - if self._downloader.params.get('listsubtitles', False): - page = self._download_webpage(url, video_id) - self._list_available_subtitles(video_id, page) - return - - subtitles = {} - if self._have_to_download_any_subtitles: - page = self._download_webpage(url, video_id) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id, url) return { 'id': video_id, @@ -111,7 +103,8 @@ class RaiIE(SubtitlesInfoExtractor): 'subtitles': subtitles, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, url): + webpage = self._download_webpage(url, video_id) subtitles = {} m = re.search(r' Date: 
Wed, 18 Feb 2015 20:37:16 +0100 Subject: [PATCH 020/131] [viki] Convert to new subtitles system --- test/test_subtitles.py | 13 +++++++++++++ youtube_dl/extractor/viki.py | 17 +++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 4dbb50515..98d1afff4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -23,6 +23,7 @@ from youtube_dl.extractor import ( ComedyCentralIE, NRKTVIE, RaiIE, + VikiIE, ) @@ -372,5 +373,17 @@ class TestRaiSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') +class TestVikiSubtitles(BaseTestSubtitles): + url = 'http://www.viki.com/videos/1060846v-punch-episode-18' + IE = VikiIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 944901e14..6816dacb6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -2,16 +2,17 @@ from __future__ import unicode_literals import re +from ..compat import compat_urlparse from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, US_RATINGS, ) -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor -class VikiIE(SubtitlesInfoExtractor): +class VikiIE(InfoExtractor): IE_NAME = 'viki' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' @@ -69,9 +70,6 @@ class VikiIE(SubtitlesInfoExtractor): # subtitles video_subtitles = self.extract_subtitles(video_id, info_webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, info_webpage) - return return { 'id': video_id, @@ -85,12 +83,15 @@ class 
VikiIE(SubtitlesInfoExtractor): 'upload_date': upload_date, } - def _get_available_subtitles(self, video_id, info_webpage): + def _get_subtitles(self, video_id, info_webpage): res = {} - for sturl_html in re.findall(r'', info_webpage): + for sturl_html in re.findall(r'[a-z]+)\.vtt', sturl) if not m: continue - res[m.group('lang')] = sturl + res[m.group('lang')] = [{ + 'url': compat_urlparse.urljoin('http://www.viki.com', sturl), + 'ext': 'vtt', + }] return res From 8807f1277f8c69488046fc7215cc79165e976ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 14:54:50 +0100 Subject: [PATCH 021/131] [theplatform] Convert to new subtitles system --- test/test_subtitles.py | 15 +++++++++++++++ youtube_dl/extractor/theplatform.py | 18 +++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 98d1afff4..c04fe6f22 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -24,6 +24,7 @@ from youtube_dl.extractor import ( NRKTVIE, RaiIE, VikiIE, + ThePlatformIE, ) @@ -385,5 +386,19 @@ class TestVikiSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') +class TestThePlatformSubtitles(BaseTestSubtitles): + # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ + # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/) + url = 'theplatform:JFUjUE1_ehvq' + IE = ThePlatformIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 1579822f2..5f24189cc 100644 --- 
a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,7 +8,7 @@ import binascii import hashlib -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) @@ -22,7 +22,7 @@ from ..utils import ( _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) -class ThePlatformIE(SubtitlesInfoExtractor): +class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?P(?:[^/\?]+/(?:swf|config)|onsite)/select/)? @@ -104,15 +104,11 @@ class ThePlatformIE(SubtitlesInfoExtractor): captions = info.get('captions') if isinstance(captions, list): for caption in captions: - lang, src = caption.get('lang'), caption.get('src') - if lang and src: - subtitles[lang] = src - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles[lang] = [{ + 'ext': 'srt' if mime == 'text/srt' else 'ttml', + 'url': src, + }] head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) From f13b1e7d7fd4a63c9ca4a0aa9930c540033cc408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 16:46:41 +0100 Subject: [PATCH 022/131] [bbccouk] Convert to new subtitles system I haven't found any video available outside the UK, so I haven't added tests. I have updated how the srt file is built, because (at least for www.bbc.co.uk/programmes/p02j9b69) the subtitles are inside 'span' elements. 
--- youtube_dl/extractor/bbccouk.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f23e39545..abc34a576 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals import xml.etree.ElementTree -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..utils import ExtractorError from ..compat import compat_HTTPError -class BBCCoUkIE(SubtitlesInfoExtractor): +class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' @@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor): formats.extend(conn_formats) return formats - def _extract_captions(self, media, programme_id): + def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) srt = '' + + def _extract_text(p): + if p.text is not None: + stripped_text = p.text.strip() + if stripped_text: + return stripped_text + return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), - p.text.strip() if p.text is not None else '') - subtitles[lang] = srt + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + { + 'data': srt, + 'ext': 'srt', + }, + ] 
return subtitles def _download_media_selector(self, programme_id): @@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor): elif kind == 'video': formats.extend(self._extract_video(media, programme_id)) elif kind == 'captions': - subtitles = self._extract_captions(media, programme_id) + subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor): else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(programme_id, subtitles) - return - self._sort_formats(formats) return { From fb7cb6823e5ace9437bc79f2e1928a30f317856b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 19 Feb 2015 23:24:24 +0100 Subject: [PATCH 023/131] Remove the SubtitlesInfoExtractor class No longer needed --- youtube_dl/extractor/subtitles.py | 99 ------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 youtube_dl/extractor/subtitles.py diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py deleted file mode 100644 index 59a51268d..000000000 --- a/youtube_dl/extractor/subtitles.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( - ExtractorError, -) - - -class SubtitlesInfoExtractor(InfoExtractor): - @property - def _have_to_download_any_subtitles(self): - return any([self._downloader.params.get('writesubtitles', False), - self._downloader.params.get('writeautomaticsub')]) - - def _list_available_subtitles(self, video_id, webpage): - """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id, webpage) - auto_captions_list = self._get_available_automatic_caption(video_id, webpage) - sub_lang = 
",".join(list(sub_lang_list.keys())) - self.to_screen('%s: Available subtitles for video: %s' % - (video_id, sub_lang)) - auto_lang = ",".join(auto_captions_list.keys()) - self.to_screen('%s: Available automatic captions for video: %s' % - (video_id, auto_lang)) - - def extract_subtitles(self, video_id, webpage): - """ - returns {sub_lang: sub} ,{} if subtitles not found or None if the - subtitles aren't requested. - """ - if not self._have_to_download_any_subtitles: - return None - available_subs_list = {} - if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) - if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id, webpage)) - - if not available_subs_list: # error, it didn't get the available subtitles - return {} - if self._downloader.params.get('allsubtitles', False): - sub_lang_list = available_subs_list - else: - if self._downloader.params.get('subtitleslangs', False): - requested_langs = self._downloader.params.get('subtitleslangs') - elif 'en' in available_subs_list: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs_list.keys())[0]] - - sub_lang_list = {} - for sub_lang in requested_langs: - if sub_lang not in available_subs_list: - self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang) - continue - sub_lang_list[sub_lang] = available_subs_list[sub_lang] - - subtitles = {} - for sub_lang, url in sub_lang_list.items(): - subtitle = self._request_subtitle_url(sub_lang, url) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - - def _download_subtitle_url(self, sub_lang, url): - return self._download_webpage(url, None, note=False) - - def _request_subtitle_url(self, sub_lang, url): - """ makes the http request for the subtitle """ - try: - sub = self._download_subtitle_url(sub_lang, url) - except ExtractorError 
as err: - self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning('Did not fetch video subtitles') - return - return sub - - def _get_available_subtitles(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses - """ - - # By default, allow implementations to simply pass in the result - assert isinstance(webpage, dict), \ - '_get_available_subtitles not implemented' - return webpage - - def _get_available_automatic_caption(self, video_id, webpage): - """ - returns {sub_lang: url} or {} if not available - Must be redefined by the subclasses that support automatic captions, - otherwise it will return {} - """ - self._downloader.report_warning('Automatic Captions not supported by this server') - return {} From 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 10:49:45 +0100 Subject: [PATCH 024/131] [chirbit] Add new extractor. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/chirbit.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/chirbit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f225ac654..de08e69bc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,6 +63,7 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE +from .chirbit import ChirbitIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..06a3e1a7a --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ChirbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P[^/]+)' + _TEST = { + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'display_id': 'kukushtv_1423231243', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + } + } + + def _real_extract(self, url): + audio_linkid = self._match_id(url) + webpage = self._download_webpage(url, audio_linkid) + + audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') + audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') + audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + + return { + 'id': audio_linkid, + 'display_id': audio_id, + 'title': audio_title, + 'url': audio_url + } From 365577f5676d63089cb834855dd4cdce7d0dc8aa Mon Sep 17 00:00:00 2001 From: "Leslie P. 
Polzer" Date: Fri, 20 Feb 2015 14:48:12 +0100 Subject: [PATCH 025/131] [chirbit] add profile extractor. --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/chirbit.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index de08e69bc..94e150826 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,7 +63,7 @@ from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .chirbit import ChirbitIE +from .chirbit import ChirbitIE, ChirbitProfileIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 06a3e1a7a..47ce94aa0 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import clean_html class ChirbitIE(InfoExtractor): @@ -32,3 +35,63 @@ class ChirbitIE(InfoExtractor): 'title': audio_title, 'url': audio_url } + +class ChirbitProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'playlist_count': 3, + 'info_dict': { + '_type': 'playlist', + 'title': 'ScarletBeauty', + 'id': 'ScarletBeauty' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + # Chirbit has a pretty weird "Last Page" navigation behavior. + # We grab the profile's oldest entry to determine when to + # stop fetching entries. 
+ oldestpage = self._download_webpage(url + '/24599', profile_id) + oldest_page_entries = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + oldestpage); + oldestentry = clean_html(oldest_page_entries[-1]); + + ids = [] + titles = [] + n = 0 + while True: + page = self._download_webpage(url + '/' + str(n), profile_id) + page_ids = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + page); + page_titles = re.findall( + r'''(.*?)''', + page); + ids += page_ids + titles += page_titles + if oldestentry in page_ids: + break + n += 1 + + entries = [] + i = 0 + for id in ids: + entries.append({ + 'id': id, + 'title': titles[i], + 'url': 'http://audio.chirbit.com/' + id + '.mp3' + }); + i += 1 + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; From fad6768bd1a67dccbb153ac371d3e82575321ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Feb 2015 18:00:25 +0600 Subject: [PATCH 026/131] [vimeo] Fix password protected videos (Closes #5001) --- youtube_dl/extractor/vimeo.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 78d287e0e..5f8649e35 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import json import re import itertools +import hashlib from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor @@ -225,6 +226,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id + password = self._downloader.params.get('videopassword', None) + if password: + headers['Cookie'] = '%s_password=%s' % (video_id, hashlib.md5(password).hexdigest()) + # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, 
headers) try: From 62b013df0dcb5f902d745b77b3de62b64b828863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Feb 2015 18:31:10 +0600 Subject: [PATCH 027/131] [vimeo] Encode password before hash calculation --- youtube_dl/extractor/vimeo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5f8649e35..4cd2f73d9 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -228,7 +228,8 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): password = self._downloader.params.get('videopassword', None) if password: - headers['Cookie'] = '%s_password=%s' % (video_id, hashlib.md5(password).hexdigest()) + headers['Cookie'] = '%s_password=%s' % ( + video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) From 77b2986b5b0246234b72ae9dd78fb40f9d37374f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 14:51:07 +0100 Subject: [PATCH 028/131] [extractor/common] Recognize Indian censorship (#5021) --- youtube_dl/extractor/common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 08b8ad37c..ee64ad329 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -391,6 +391,16 @@ class InfoExtractor(object): if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if 'The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

(.*?)

', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content From 8fb3ac3649ca7df6f328971f58afa84dd9d05cc6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 14:55:13 +0100 Subject: [PATCH 029/131] PEP8: W503 --- devscripts/check-porn.py | 8 ++++---- test/test_swfinterp.py | 4 ++-- youtube_dl/YoutubeDL.py | 8 ++++---- youtube_dl/__init__.py | 16 ++++++++-------- youtube_dl/downloader/common.py | 10 +++++----- youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/adobetv.py | 7 ++++--- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/defense.py | 5 +++-- youtube_dl/utils.py | 4 ++-- 10 files changed, 36 insertions(+), 34 deletions(-) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 216282712..6a5bd9eda 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -45,12 +45,12 @@ for test in get_testcases(): RESULT = ('.' 
+ domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] - or test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or + test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] - and test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 9f18055e6..f1e899819 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -34,8 +34,8 @@ def _make_testfunc(testfile): def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') - if ((not os.path.exists(swf_file)) - or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + if ((not os.path.exists(swf_file)) or + os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 88809783b..ca7c3f5c6 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -308,8 +308,8 @@ class YoutubeDL(object): raise if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@ -1366,8 +1366,8 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = 
self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in outtmpl - and self.params.get('max_downloads') != 1): + '%' not in outtmpl and + self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index eac2a26ec..25ab3fdfe 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -189,14 +189,14 @@ def _real_main(argv=None): # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: opts.outtmpl = opts.outtmpl.decode(preferredencoding()) - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 45e55b99c..3ae90021a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -311,14 +311,14 @@ class FileDownloader(object): """ nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) - and os.path.exists(encodeFilename(filename)) + self.params.get('nooverwrites', False) and + os.path.exists(encodeFilename(filename)) ) continuedl_and_exists = ( - self.params.get('continuedl', False) - and os.path.isfile(encodeFilename(filename)) - and not self.params.get('nopart', False) + self.params.get('continuedl', False) and + os.path.isfile(encodeFilename(filename)) and + not self.params.get('nopart', False) ) # Check file already present diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b40ebfa50..7b8fe8cf5 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -325,8 +325,8 @@ class F4mFD(FileDownloader): state['frag_index'] += 1 estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) - / (state['frag_index'] + 1) * total_frags) + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) time_now = time.time() state['total_bytes_estimate'] = estimated_size state['elapsed'] = time_now - start diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 28e07f8b0..97d128560 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -28,7 +28,6 @@ class AdobeTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) player = self._parse_json( @@ -44,8 +43,10 @@ class AdobeTVIE(InfoExtractor): self._html_search_meta('datepublished', webpage, 'upload date')) duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') - or 
self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration')) + self._html_search_meta('duration', webpage, 'duration') or + self._search_regex( + r'Runtime:\s*(\d{2}:\d{2}:\d{2})', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'
\s*Views?:\s*([\d,.]+)\s*
', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ee64ad329..d3f86cf4a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -808,8 +808,8 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' - + (media_el.attrib.get('href') or media_el.attrib.get('url'))) + manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + + (media_el.attrib.get('href') or media_el.attrib.get('url'))) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 2b90bf4fc..98e3aedfd 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -25,8 +25,9 @@ class DefenseGouvFrIE(InfoExtractor): r"flashvars.pvg_id=\"(\d+)\";", webpage, 'ID') - json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/' - + video_id) + json_url = ( + 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' % + video_id) info = self._download_json(json_url, title, 'Downloading JSON config') video_url = info['renditions'][0]['url'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 238b6556b..475fad3c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -900,8 +900,8 @@ def _windows_write_string(s, out): def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or + GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False From 
93540ee10e4143f8de7885af2d68c213aab7d8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 19:31:39 +0100 Subject: [PATCH 030/131] [rtve] Fix the video url Changing mvod to mvod1 fixes the url, we don't need to add the query. --- youtube_dl/extractor/rtve.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 3469d9578..e60f85b5b 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,7 @@ import re import time from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( struct_unpack, remove_end, @@ -96,12 +97,10 @@ class RTVEALaCartaIE(InfoExtractor): ).replace('.net.rtve', '.multimedia.cdn.rtve') video_path = self._download_webpage( auth_url, video_id, 'Getting video url') - # Use mvod.akcdn instead of flash.akamaihd.multimedia.cdn to get + # Use mvod1.akcdn instead of flash.akamaihd.multimedia.cdn to get # the right Content-Length header and the mp4 format - video_url = ( - 'http://mvod.akcdn.rtve.es/{0}&v=2.6.8' - '&fp=MAC%2016,0,0,296&r=MRUGG&g=OEOJWFXNFGCP'.format(video_path) - ) + video_url = compat_urlparse.urljoin( + 'http://mvod1.akcdn.rtve.es/', video_path) return { 'id': video_id, From 4aeccadf4ef8528c252c917d071e98a091f7766c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:38:57 +0600 Subject: [PATCH 031/131] [zapiks] Add extractor (#5014) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/zapiks.py | 102 +++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 youtube_dl/extractor/zapiks.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7d0c6b5ac..ef0adfd87 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -612,6 +612,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) 
+from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ( ZingMp3SongIE, diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py new file mode 100644 index 000000000..12810637e --- /dev/null +++ b/youtube_dl/extractor/zapiks.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, + xpath_with_ns, + xpath_text, + int_or_none, +) + + +class ZapiksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zapiks\.fr/(?:(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' + _TESTS = [ + { + 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', + 'md5': 'aeb3c473b2d564b2d46d664d28d5f050', + 'info_dict': { + 'id': '80798', + 'ext': 'mp4', + 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!', + 'description': 'md5:7054d6f6f620c6519be1fe710d4da847', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 528, + 'timestamp': 1359044972, + 'upload_date': '20130124', + 'view_count': int, + 'comment_count': int, + }, + }, + { + 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + if not video_id: + video_id = self._search_regex( + r'data-media-id="(\d+)"', webpage, 'video id') + + playlist = self._download_xml( + 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id, + display_id) + + NS_MAP = { + 'jwplayer': 'http://rss.jwpcdn.com/' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./channel/item') + + title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage) + description = 
self._og_search_description(webpage, default=None) + thumbnail = xpath_text( + item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None), ' ') + + view_count = int_or_none(self._search_regex( + r'UserPlays:(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'UserComments:(\d+)', webpage, 'comment count', default=None)) + + formats = [] + for source in item.findall(ns('./jwplayer:source')): + format_id = source.attrib['label'] + f = { + 'url': source.attrib['file'], + 'format_id': format_id, + } + m = re.search(r'^(?P\d+)[pP]', format_id) + if m: + f['height'] = int(m.group('height')) + formats.append(f) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + } From 255fca5eea70a171530a5a0f2af143362f0211cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:39:26 +0600 Subject: [PATCH 032/131] [generic] Add support for Zapiks embeds (#5014) --- youtube_dl/extractor/generic.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8dce96a64..875e1bf05 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -547,7 +547,16 @@ class GenericIE(InfoExtractor): 'id': 'aanslagen-kopenhagen', 'title': 'Aanslagen Kopenhagen | RTL Nieuws', } - } + }, + # Zapiks embed + { + 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', + 'info_dict': { + 'id': '118046', + 'ext': 'mp4', + 
'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', + } + }, ] def report_following_redirect(self, new_url): @@ -1098,6 +1107,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') + # Look for Zapiks embed + mobj = re.search( + r']+src="(?Phttps?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Zapiks') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From ea5152cae110d55b82c755c23926f077b90c071c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 01:42:47 +0600 Subject: [PATCH 033/131] [zapiks] Extend _VALID_URL (#5014) --- youtube_dl/extractor/zapiks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py index 12810637e..22a9a57e8 100644 --- a/youtube_dl/extractor/zapiks.py +++ b/youtube_dl/extractor/zapiks.py @@ -14,7 +14,7 @@ from ..utils import ( class ZapiksIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zapiks\.fr/(?:(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' + _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P.+?)\.html|index\.php\?.*\bmedia_id=(?P\d+))' _TESTS = [ { 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', @@ -32,6 +32,14 @@ class ZapiksIE(InfoExtractor): 'comment_count': int, }, }, + { + 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, + { + 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html', + 'only_matching': True, + }, { 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&media_id=118046&width=640&height=360&autoStart=false&language=fr', 'only_matching': True, From c5181ab4101323de94bdb20850c64711c625c3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 02:10:26 +0600 Subject: [PATCH 034/131] [gdcvault] Fix rtmp 
streams (Closes #5024) --- youtube_dl/extractor/gdcvault.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index fed968f51..05f58f1af 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) +from ..utils import remove_end class GDCVaultIE(InfoExtractor): @@ -68,7 +69,9 @@ class GDCVaultIE(InfoExtractor): akami_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ - 'url': 'rtmp://' + akami_url + '/' + slide_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'slide deck video', 'quality': -2, 'preference': -2, @@ -76,7 +79,9 @@ class GDCVaultIE(InfoExtractor): }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text video_formats.append({ - 'url': 'rtmp://' + akami_url + '/' + speaker_video_path, + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', 'format_note': 'speaker video', 'quality': -1, 'preference': -1, From 314368c822428437e60bbc24af65d5415717632c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Sat, 21 Feb 2015 22:19:39 +0200 Subject: [PATCH 035/131] [teamcoco] Fix extraction Also, use a single style of quotes --- youtube_dl/extractor/teamcoco.py | 49 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index a73da1c9c..5793dbc10 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals +import base64 import re from .common 
import InfoExtractor +from ..utils import qualities class TeamcocoIE(InfoExtractor): @@ -24,8 +26,8 @@ class TeamcocoIE(InfoExtractor): 'info_dict': { 'id': '19705', 'ext': 'mp4', - "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.", - "title": "Louis C.K. Interview Pt. 1 11/3/11", + 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', + 'title': 'Louis C.K. Interview Pt. 1 11/3/11', 'age_limit': 0, } } @@ -42,42 +44,39 @@ class TeamcocoIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - video_id = mobj.group("video_id") + video_id = mobj.group('video_id') if not video_id: video_id = self._html_search_regex( self._VIDEO_ID_REGEXES, webpage, 'video id') - data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id - data = self._download_xml( - data_url, display_id, 'Downloading data webpage') + embed_url = 'http://teamcoco.com/embed/v/%s' % video_id + embed = self._download_webpage( + embed_url, video_id, 'Downloading embed page') + + encoded_data = self._search_regex( + r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') + data = self._parse_json( + base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) - qualities = ['500k', '480p', '1000k', '720p', '1080p'] formats = [] - for filed in data.findall('files/file'): - if filed.attrib.get('playmode') == 'all': - # it just duplicates one of the entries - break - file_url = filed.text - m_format = re.search(r'(\d+(k|p))\.mp4', file_url) + get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) + for filed in data['files']: + m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) if m_format is not None: format_id = m_format.group(1) else: - format_id = filed.attrib['bitrate'] + format_id = filed['bitrate'] tbr = ( - int(filed.attrib['bitrate']) - if filed.attrib['bitrate'].isdigit() + int(filed['bitrate']) + if filed['bitrate'].isdigit() else None) - try: - quality = 
qualities.index(format_id) - except ValueError: - quality = -1 formats.append({ - 'url': file_url, + 'url': filed['url'], 'ext': 'mp4', 'tbr': tbr, 'format_id': format_id, - 'quality': quality, + 'quality': get_quality(format_id), }) self._sort_formats(formats) @@ -86,8 +85,8 @@ class TeamcocoIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'formats': formats, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'title': data['title'], + 'thumbnail': data.get('thumb', {}).get('href'), + 'description': data.get('teaser'), 'age_limit': self._family_friendly_search(webpage), } From e086e0eb6cef80db2d4ab44572a1a5d6b6f1dee0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 21 Feb 2015 21:25:29 +0100 Subject: [PATCH 036/131] release 2015.02.21 --- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbff5e270..5fe3e47cd 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -559,6 +559,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **Zapiks** - **ZDF** - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9fd0ee963..7c8b29c3b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.20' +__version__ = '2015.02.21' From 4d1718481755dde078678b6e55d457fc6351fcdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 22:31:53 +0100 Subject: [PATCH 037/131] [YoutubeDL] don't set the 
'requested_subtitles' without writesubtitles or writeautomaticsub --- test/test_subtitles.py | 10 +++++----- youtube_dl/YoutubeDL.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c04fe6f22..457f268fa 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -113,7 +113,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_youtube_multiple_langs(self): self.url = 'QRS8MkLhQmM' @@ -152,7 +152,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_multiple_langs(self): self.DL.params['writesubtitles'] = True @@ -246,7 +246,7 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) def test_multiple_langs(self): self.DL.params['writesubtitles'] = True @@ -281,7 +281,7 @@ class TestWallaSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) class TestCeskaTelevizeSubtitles(BaseTestSubtitles): @@ -308,7 +308,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles), 0) + self.assertFalse(subtitles) class TestLyndaSubtitles(BaseTestSubtitles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f8b8fb0c1..088b111eb 100755 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1164,8 +1164,10 @@ class YoutubeDL(object): if lang not in available_subs: available_subs[lang] = cap_info - if not available_subs: - return available_subs + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None if self.params.get('allsubtitles', False): requested_langs = available_subs.keys() From 03091e372f7033fa52c7961b1a99cd3790c0f60f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 21 Feb 2015 22:33:11 +0100 Subject: [PATCH 038/131] [ted] Always extract the subtitles The required info is already in the webpage --- youtube_dl/extractor/ted.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1809eaae4..0c38c8f89 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -163,8 +163,6 @@ class TEDIE(InfoExtractor): self._sort_formats(formats) video_id = compat_str(talk_info['id']) - # subtitles - video_subtitles = self.extract_subtitles(video_id, talk_info) thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -175,7 +173,7 @@ class TEDIE(InfoExtractor): 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'subtitles': video_subtitles, + 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, 'duration': talk_info.get('duration'), } @@ -194,7 +192,6 @@ class TEDIE(InfoExtractor): ] return sub_lang_list else: - self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): From ab84349b16b3c94775543a04855fc77005f8237e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:26:27 +0100 Subject: [PATCH 039/131] [test/YoutubeDL] Add test for subtitles Updated the offlinetest make target to not 
skip it --- Makefile | 2 +- test/test_YoutubeDL.py | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0636fc4cb..07c90c225 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test: ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index b1cd6a69f..e11292211 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -337,6 +337,58 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + def test_subtitles(self): + def s_formats(lang, autocaption=False): + return [{ + 'ext': ext, + 'url': 'http://localhost/video.%s.%s' % (lang, ext), + '_auto': autocaption, + } for ext in ['vtt', 'srt', 'ass']] + subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es']) + auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es']) + info_dict = { + 'id': 'test', + 'title': 'Test', + 'url': 'http://localhost/video.mp4', + 'subtitles': subtitles, + 'automatic_captions': auto_captions, + 'extractor': 'TEST', + } + + def get_info(params={}): + params.setdefault('simulate', True) + ydl = YDL(params) + ydl.report_warning = lambda *args, **kargs: None + return ydl.process_video_result(info_dict, download=False) + + result = get_info() + self.assertFalse(result.get('requested_subtitles')) + self.assertEqual(result['subtitles'], subtitles) + self.assertEqual(result['automatic_captions'], auto_captions) + + result = get_info({'writesubtitles': True}) + subs = 
result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['en'])) + self.assertTrue(subs['en'].get('data') is None) + self.assertEqual(subs['en']['ext'], 'ass') + + result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'}) + subs = result['requested_subtitles'] + self.assertEqual(subs['en']['ext'], 'srt') + + result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'fr'])) + + result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertFalse(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', From 98c70d6fc7006c8cbbd76fb1b8661d758fc4f5d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:37:27 +0100 Subject: [PATCH 040/131] [YoutubeDL] only add normal subtitles to the 'requested_subtitles' field if 'writesubtitles' is True --- test/test_YoutubeDL.py | 7 +++++++ youtube_dl/YoutubeDL.py | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index e11292211..055e42555 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -389,6 +389,13 @@ class TestFormatSelection(unittest.TestCase): self.assertFalse(subs['es']['_auto']) self.assertTrue(subs['pt']['_auto']) + result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']}) + subs = result['requested_subtitles'] + self.assertTrue(subs) + self.assertEqual(set(subs.keys()), set(['es', 'pt'])) + self.assertTrue(subs['es']['_auto']) + self.assertTrue(subs['pt']['_auto']) + def test_add_extra_info(self): test_dict = { 'extractor': 
'Foo', diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 088b111eb..7319323e5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1156,11 +1156,13 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict - def process_subtitles(self, video_id, available_subs, available_autocaps): + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" - if available_autocaps and self.params.get('writeautomaticsub'): - available_subs = available_subs.copy() - for lang, cap_info in available_autocaps.items(): + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info From b7bb76df05f53d4fc0570d07be5abcee238745e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:49:27 +0100 Subject: [PATCH 041/131] [test/subtitles] Remove some tests Test only with 'allsubtitles', the language selection is already tested in test_YoutubeDL.py --- test/test_subtitles.py | 121 +++++------------------------------------ 1 file changed, 13 insertions(+), 108 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 457f268fa..aa4e2bec4 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -57,22 +57,15 @@ class TestYoutubeSubtitles(BaseTestSubtitles): url = 'QRS8MkLhQmM' IE = YoutubeIE - def test_youtube_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - - def test_youtube_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = 
self.getSubtitles() - self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + for lang in ['it', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True @@ -86,12 +79,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - def test_youtube_list_subtitles(self): - self.DL.expect_warning('Video doesn\'t have automatic captions') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True @@ -115,36 +102,20 @@ class TestYoutubeSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_youtube_multiple_langs(self): - self.url = 'QRS8MkLhQmM' - self.DL.params['writesubtitles'] = True - langs = ['it', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestDailymotionSubtitles(BaseTestSubtitles): url = 'http://www.dailymotion.com/video/xczg00' IE = DailymotionIE - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = 
True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 6) + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + for lang in ['es', 'fr', 'de']: + self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -154,51 +125,19 @@ class TestDailymotionSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE - def test_no_writesubtitles(self): - subtitles = self.getSubtitles() - self.assertFalse(subtitles) - - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) >= 28) - - def 
test_list_subtitles(self): - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: + self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14') + self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5') + for lang in ['es', 'fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) @@ -206,11 +145,6 @@ class TestBlipTVSubtitles(BaseTestSubtitles): url = 'http://blip.tv/a/a-6603250' IE = BlipTVIE - def test_list_subtitles(self): - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True @@ -223,22 +157,13 @@ class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE - def test_subtitles(self): - self.DL.params['writesubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - - def test_subtitles_lang(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitleslangs'] = ['fr'] - subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') - def test_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) + self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') + self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') @@ -248,25 +173,11 @@ class 
TestVimeoSubtitles(BaseTestSubtitles): subtitles = self.getSubtitles() self.assertFalse(subtitles) - def test_multiple_langs(self): - self.DL.params['writesubtitles'] = True - langs = ['es', 'fr', 'de'] - self.DL.params['subtitleslangs'] = langs - subtitles = self.getSubtitles() - for lang in langs: - self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True @@ -288,12 +199,6 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE - def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['listsubtitles'] = True - info_dict = self.getInfoDict() - self.assertEqual(info_dict, None) - def test_allsubtitles(self): self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['writesubtitles'] = True From 80970e531bd377e1952ac358e7e345cfbf23593d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 22 Feb 2015 11:52:22 +0100 Subject: [PATCH 042/131] [test/subtitles] Update checksum for Viki --- test/test_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index aa4e2bec4..7f93f0a75 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -288,7 +288,7 @@ class TestVikiSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles 
= self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'b0b781eeb45efd3f6398a925b259150b') + self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') class TestThePlatformSubtitles(BaseTestSubtitles): From f311cfa23153fee51f94f14d1ab1f7f8b6a74702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 19:53:32 +0600 Subject: [PATCH 043/131] [appletrailers] Extend _VALID_URL (Closes #5027) --- youtube_dl/extractor/appletrailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 43e82847f..9c718ea66 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,7 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P[^/]+)/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' _TEST = { "url": "http://trailers.apple.com/trailers/wb/manofsteel/", 'info_dict': { From 35b798230334e984977090ae03a307eaf7eedbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 19:58:39 +0600 Subject: [PATCH 044/131] [appletrailers] Add test (#5027) --- youtube_dl/extractor/appletrailers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 9c718ea66..576f03b5b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -12,7 +12,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' - _TEST = { + _TESTS = [{ "url": "http://trailers.apple.com/trailers/wb/manofsteel/", 'info_dict': { 'id': 'manofsteel', @@ -63,7 +63,10 @@ class 
AppleTrailersIE(InfoExtractor): }, }, ] - } + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }] _JSON_RE = r'iTunes.playURL\((.*?)\);' From c010af6f195c2e84aec7d0ddec060fcbe9c45089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Feb 2015 23:11:33 +0600 Subject: [PATCH 045/131] [escapist] Make regexes more robust (Closes #5028) --- youtube_dl/extractor/escapist.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 6b693b3b6..b49b9869f 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -31,10 +31,10 @@ class EscapistIE(InfoExtractor): webpage = self._download_webpage(url, video_id) uploader_id = self._html_search_regex( - r"

\s*(.*?)", + r"(.*?)", webpage, 'uploader', fatal=False) description = self._html_search_meta('description', webpage) @@ -42,7 +42,7 @@ class EscapistIE(InfoExtractor): title = raw_title.partition(' : ')[2] config_url = compat_urllib_parse.unquote(self._html_search_regex( - r' Date: Mon, 23 Feb 2015 03:30:10 +0600 Subject: [PATCH 046/131] [extractor/common] Fix preference for m3u8 quality selection URL --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d3f86cf4a..79f6d199b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -833,7 +833,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] From fcc3e6138b372e13578949dc724f456ae76dd065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 03:32:53 +0600 Subject: [PATCH 047/131] [r7] Add extractor (Closes #4405, closes #5004) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/r7.py | 88 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 youtube_dl/extractor/r7.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ef0adfd87..4d3e79de9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -364,6 +364,7 @@ from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .quickvid import QuickVidIE +from .r7 import R7IE from .radiode import RadioDeIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py new file mode 100644 index 000000000..976c8feec --- /dev/null +++ b/youtube_dl/extractor/r7.py @@ -0,0 +1,88 @@ +# 
coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + unescapeHTML, + int_or_none, +) + + +class R7IE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?: + (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| + noticias\.r7\.com(?:/[^/]+)+/[^/]+-| + player\.r7\.com/video/i/ + ) + (?P[\da-f]{24}) + ''' + _TESTS = [{ + 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', + 'md5': '403c4e393617e8e8ddc748978ee8efde', + 'info_dict': { + 'id': '54e7050b0cf2ff57e0279389', + 'ext': 'mp4', + 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 98, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html', + 'only_matching': True, + }, { + 'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/', + 'only_matching': True, + }, { + 'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://player.r7.com/video/i/%s' % video_id, video_id) + + item = self._parse_json(js_to_json(self._search_regex( + r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), 
video_id) + + title = unescapeHTML(item['title']) + thumbnail = item.get('init', {}).get('thumbUri') + duration = None + + statistics = item.get('statistics', {}) + like_count = int_or_none(statistics.get('likes')) + view_count = int_or_none(statistics.get('views')) + + formats = [] + for format_key, format_dict in item['playlist'][0].items(): + src = format_dict.get('src') + if not src: + continue + format_id = format_dict.get('format') or format_key + if duration is None: + duration = format_dict.get('duration') + if '.f4m' in src: + formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) + elif src.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) + else: + formats.append({ + 'url': src, + 'format_id': format_id, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'like_count': like_count, + 'view_count': view_count, + 'formats': formats, + } From 1b40dc92eb27b2a3f299157f83bfc8e95ca42268 Mon Sep 17 00:00:00 2001 From: Duncan Keall Date: Mon, 23 Feb 2015 16:10:08 +1300 Subject: [PATCH 048/131] [airmozilla] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/airmozilla.py | 73 ++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/airmozilla.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ef0adfd87..7e1864315 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -8,6 +8,7 @@ from .adobetv import AdobeTVIE from .adultswim import AdultSwimIE from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE +from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .anitube import AnitubeIE diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py new file mode 100644 
index 000000000..44c20f886 --- /dev/null +++ b/youtube_dl/extractor/airmozilla.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_iso8601 + + +class AirMozillaIE(InfoExtractor): + _VALID_URL = r'https?://air\.mozilla\.org/(?P[0-9a-z-]+)/?' + _TEST = { + 'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/', + 'md5': '2e3e7486ba5d180e829d453875b9b8bf', + 'info_dict': { + 'id': '6x4q2w', + 'ext': 'mp4', + 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', + 'thumbnail': 're:https://\w+\.cloudfront\.net/6x4q2w/poster\.jpg\?t=\d+', + 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', + 'timestamp': 1422487800, + 'upload_date': '20150128', + 'location': 'SFO Commons', + 'duration': 3780, + 'view_count': int, + 'categories': ['Main'], + } + } + + _QUALITY_MAP = { + '360p': 0, + '576p': 1, + '640p': 2, + '720p': 3, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id') + + embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id) + jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata') + metadata = self._parse_json(jwconfig, video_id) + + formats = [] + for source in metadata['playlist'][0]['sources']: + fmt = { + 'url': source['file'], + 'ext': source['type'], + 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), + 'resolution': source['label'], + 'quality': self._QUALITY_MAP.get(source['label'], -1), + } + formats.append(fmt) + self._sort_formats(formats) + + duration_match = re.search(r'Duration:(?: (?P\d+) hours?)?(?: 
(?P\d+) minutes?)?', webpage) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'url': self._og_search_url(webpage), + 'display_id': display_id, + 'thumbnail': metadata['playlist'][0]['image'], + 'description': self._og_search_description(webpage), + 'timestamp': parse_iso8601(self._html_search_regex(r'

', webpage, 'title') - audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') - audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) return { - 'id': audio_linkid, - 'display_id': audio_id, - 'title': audio_title, - 'url': audio_url + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, } + class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', - 'playlist_count': 3, 'info_dict': { - '_type': 'playlist', - 'title': 'ScarletBeauty', - 'id': 'ScarletBeauty' - } + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) - # Chirbit has a pretty weird "Last Page" navigation behavior. - # We grab the profile's oldest entry to determine when to - # stop fetching entries. 
- oldestpage = self._download_webpage(url + '/24599', profile_id) - oldest_page_entries = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - oldestpage); - oldestentry = clean_html(oldest_page_entries[-1]); + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) - ids = [] - titles = [] - n = 0 - while True: - page = self._download_webpage(url + '/' + str(n), profile_id) - page_ids = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - page); - page_titles = re.findall( - r'''(.*?)''', - page); - ids += page_ids - titles += page_titles - if oldestentry in page_ids: - break - n += 1 + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] - entries = [] - i = 0 - for id in ids: - entries.append({ - 'id': id, - 'title': titles[i], - 'url': 'http://audio.chirbit.com/' + id + '.mp3' - }); - i += 1 + title = rss.find('./channel/title').text - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id, title) From 3cc57f96455ce14cc5c72264a25b8d434174f7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:24 +0600 Subject: [PATCH 053/131] [soundgasm:profile] Simplify --- youtube_dl/extractor/soundgasm.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e568ff18c..e11d999f3 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -41,36 +41,22 @@ class SoundgasmIE(InfoExtractor): } class SoundgasmProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', - 'playlist_count': 1, 
'info_dict': { - '_type': 'playlist', 'id': 'ytdl', - 'title': 'ytdl' - } + }, + 'playlist_count': 1, } def _real_extract(self, url): profile_id = self._match_id(url) + webpage = self._download_webpage(url, profile_id) - ids = re.findall(r'''''' % re.escape(profile_id), webpage) - ids = [clean_html(id) for id in ids] + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] - entries = [] - for id in ids: - entries.append({ - '_type': 'url', - 'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id)) - }) - - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id) From 80af2b73ab0b51e4416500301948caa71ec39cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:56 +0600 Subject: [PATCH 054/131] [soundgasm] Clarify extractors' IE_NAMEs --- youtube_dl/extractor/soundgasm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e11d999f3..26e96a120 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,6 +8,7 @@ from ..utils import clean_html class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', @@ -41,6 +42,7 @@ class SoundgasmIE(InfoExtractor): } class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', From 04e8c1108023d9fe5c466d16f988a469e04f326e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:28:14 +0600 Subject: [PATCH 055/131] [chirbit] Clarify extractors' IE_NAMEs --- 
youtube_dl/extractor/chirbit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 124307b7c..b1eeaf101 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -9,6 +9,7 @@ from ..utils import ( class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', @@ -57,6 +58,7 @@ class ChirbitIE(InfoExtractor): class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', From 409693984f0acb8fbbf006c0d7965bc138211ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:30:30 +0600 Subject: [PATCH 056/131] [soundgasm:profile] Fix _VALID_URL --- youtube_dl/extractor/soundgasm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 26e96a120..9e992c9b7 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -43,7 +43,7 @@ class SoundgasmIE(InfoExtractor): class SoundgasmProfileIE(InfoExtractor): IE_NAME = 'soundgasm:profile' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', 'info_dict': { From 09c200acf258de115caeda210741a59f2b971b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:31:57 +0600 Subject: [PATCH 057/131] Credit @skypher for chirbit and soundgasm:profile (#5032) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 47f12a9ee..bdd2a15dc 100644 --- a/AUTHORS +++ b/AUTHORS @@ -111,3 
+111,4 @@ Paul Hartmann Frans de Jonge Robin de Rooij Ryan Schmidt +Leslie P. Polzer From 3438e7acd27d89d83d41e722d21d7660dbad7eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:40:50 +0600 Subject: [PATCH 058/131] [soundgasm] Remove unused import --- youtube_dl/extractor/soundgasm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 9e992c9b7..1c48478a6 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import clean_html class SoundgasmIE(InfoExtractor): From bd61a9e770506283f82e2ddf9e53b587169c2f04 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:47:19 +0100 Subject: [PATCH 059/131] release 2015.02.23 --- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5fe3e47cd..9f70db80a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -72,6 +72,8 @@ - **CeskaTelevize** - **channel9**: Channel 9 - **Chilloutzone** + - **chirbit** + - **chirbit:profile** - **Cinchcast** - **Cinemassacre** - **clipfish** @@ -330,6 +332,7 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Pyvideo** - **QuickVid** + - **R7** - **radio.de** - **radiobremen** - **radiofrance** @@ -385,7 +388,8 @@ - **soundcloud:playlist** - **soundcloud:set** - **soundcloud:user** - - **Soundgasm** + - **soundgasm** + - **soundgasm:profile** - **southpark.cc.com** - **southpark.de** - **Space** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7c8b29c3b..17317b29c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.21' +__version__ = 
'2015.02.23' From 5bca2424bc2dfb15b5394a51fa5befd7148edc41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:51:09 +0100 Subject: [PATCH 060/131] [gdcvault] Remove dead code --- youtube_dl/extractor/gdcvault.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 05f58f1af..e5011a5dc 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -66,7 +66,6 @@ class GDCVaultIE(InfoExtractor): def _parse_flv(self, xml_description): video_formats = [] - akami_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', From 591ab1dff913d7ff88f30487c54c1e9c5d44d0cb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 16:51:21 +0100 Subject: [PATCH 061/131] [soundgasm] PEP8 --- youtube_dl/extractor/soundgasm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 1c48478a6..3a4ddf57e 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -40,6 +40,7 @@ class SoundgasmIE(InfoExtractor): 'description': description } + class SoundgasmProfileIE(InfoExtractor): IE_NAME = 'soundgasm:profile' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' From 4432db35d9ddd0e6777df6c596d8637514ba0b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:59:11 +0600 Subject: [PATCH 062/131] [gdcvault] Restore akamai host for rtmp videos --- youtube_dl/extractor/gdcvault.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index e5011a5dc..f7b467b0a 100644 --- 
a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -66,9 +66,10 @@ class GDCVaultIE(InfoExtractor): def _parse_flv(self, xml_description): video_formats = [] + akamai_url = xml_description.find('./metadata/akamaiHost').text slide_video_path = xml_description.find('./metadata/slideVideo').text video_formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(slide_video_path, '.flv'), 'ext': 'flv', 'format_note': 'slide deck video', @@ -78,7 +79,7 @@ class GDCVaultIE(InfoExtractor): }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text video_formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % 'fms.digitallyspeaking.com/cfx/st', + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(speaker_video_path, '.flv'), 'ext': 'flv', 'format_note': 'speaker video', From 459e5fbd5fa61064076534c4d6d8a1d010acb1b3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 23 Feb 2015 18:17:39 +0100 Subject: [PATCH 063/131] release 2015.02.23.1 --- README.md | 4 ++-- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8ea31d605..699401b49 100644 --- a/README.md +++ b/README.md @@ -351,8 +351,8 @@ which means you can modify it, redistribute it or use it however you like. 
--all-subs downloads all the available subtitles of the video --list-subs lists all available subtitles for the video - --sub-format FORMAT subtitle format (default=srt) ([sbv/vtt] - youtube only) + --sub-format FORMAT subtitle format, accepts formats + preference, for example: "ass/srt/best" --sub-lang LANGS languages of the subtitles to download (optional) separated by commas, use IETF language tags like 'en,pt' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 17317b29c..1852d834b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.23' +__version__ = '2015.02.23.1' From ffdf972b9115d6d8c86439bf0828e945823bdcf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 23 Feb 2015 18:54:15 +0100 Subject: [PATCH 064/131] [facebook] Extract all the formats (closes #5037) --- youtube_dl/extractor/facebook.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 1ad4e77a8..f0e575320 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -126,11 +126,17 @@ class FacebookIE(InfoExtractor): params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) video_data = params['video_data'][0] - video_url = video_data.get('hd_src') - if not video_url: - video_url = video_data['sd_src'] - if not video_url: - raise ExtractorError('Cannot find video URL') + + formats = [] + for quality in ['sd', 'hd']: + src = video_data.get('%s_src' % quality) + if src is not None: + formats.append({ + 'format_id': quality, + 'url': src, + }) + if not formats: + raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title', @@ -146,7 +152,7 @@ class 
FacebookIE(InfoExtractor): return { 'id': video_id, 'title': video_title, - 'url': video_url, + 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), } From 3037b91e05e68a4ab3420cbbdb23cfb0739011d3 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Mon, 23 Feb 2015 20:45:36 +0200 Subject: [PATCH 065/131] [laola1tv] Improve extraction and update test case (#3742) --- youtube_dl/extractor/laola1tv.py | 47 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 2fd3b4699..135421406 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,23 +1,26 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import random import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + xpath_text, +) class Laola1TvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html' _TEST = { - 'url': 'http://www.laola1.tv/de-de/live/bwf-bitburger-open-grand-prix-gold-court-1/250019.html', + 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { - 'id': '250019', + 'id': '227883', 'ext': 'mp4', - 'title': 'Bitburger Open Grand Prix Gold - Court 1', - 'categories': ['Badminton'], - 'uploader': 'BWF - Badminton World Federation', - 'is_live': True, + 'title': 'Straubing Tigers - Kölner Haie', + 'categories': ['Eishockey'], + 'is_live': False, }, 'params': { 'skip_download': True, @@ -43,15 +46,26 @@ class Laola1TvIE(InfoExtractor): r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe) flashvars = dict((m[0], m[1]) for m in flashvars_m) + partner_id = self._search_regex( + 'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') + xml_url = 
('http://www.laola1.tv/server/hd_video.php?' + - 'play=%s&partner=1&portal=%s&v5ident=&lang=%s' % ( - video_id, portal, lang)) + 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % ( + video_id, partner_id, portal, lang)) hd_doc = self._download_xml(xml_url, video_id) - title = hd_doc.find('.//video/title').text - flash_url = hd_doc.find('.//video/url').text - categories = hd_doc.find('.//video/meta_sports').text.split(',') - uploader = hd_doc.find('.//video/meta_organistation').text + title = xpath_text(hd_doc, './/video/title', fatal=True) + flash_url = xpath_text(hd_doc, './/video/url', fatal=True) + uploader = xpath_text(hd_doc, './/video/meta_organistation') + + is_live = xpath_text(hd_doc, './/video/islive') == 'true' + if is_live: + raise ExtractorError( + 'Live streams are not supported by the f4m downloader.') + + categories = xpath_text(hd_doc, './/video/meta_sports') + if categories: + categories = categories.split(',') ident = random.randint(10000000, 99999999) token_url = '%s&ident=%s&klub=0&unikey=0×tamp=%s&auth=%s' % ( @@ -60,15 +74,16 @@ class Laola1TvIE(InfoExtractor): token_doc = self._download_xml( token_url, video_id, note='Downloading token') token_attrib = token_doc.find('.//token').attrib - if token_attrib.get('auth') == 'blocked': - raise ExtractorError('Token error: ' % token_attrib.get('comment')) + if token_attrib.get('auth') in ('blocked', 'restricted'): + raise ExtractorError( + 'Token error: %s' % token_attrib.get('comment'), expected=True) video_url = '%s?hdnea=%s&hdcore=3.2.0' % ( token_attrib['url'], token_attrib['auth']) return { 'id': video_id, - 'is_live': True, + 'is_live': is_live, 'title': title, 'url': video_url, 'uploader': uploader, From 1fbaa0a5210976a2a8fc0c20207708c35621416a Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Mon, 23 Feb 2015 20:51:30 +0200 Subject: [PATCH 066/131] [laola1tv] Use raw strings for regular expressions Oops --- youtube_dl/extractor/laola1tv.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 135421406..e8ca49fd1 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -47,7 +47,7 @@ class Laola1TvIE(InfoExtractor): flashvars = dict((m[0], m[1]) for m in flashvars_m) partner_id = self._search_regex( - 'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') + r'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id') xml_url = ('http://www.laola1.tv/server/hd_video.php?' + 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % ( From 99209c2916753799e9c68e8d466c5253113f25bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Feb 2015 01:35:15 +0600 Subject: [PATCH 067/131] [youtube] Extract UL playlists as mixes (Closes #5040) --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 22db896b1..3690f8021 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1153,13 +1153,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' @@ -1244,7 +1244,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for vid_id in ids] def _extract_mix(self, playlist_id): - # The mixes are generated from a a single video + # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage( @@ -1280,7 +1280,7 @@ 
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - if playlist_id.startswith('RD'): + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) From 25ac63ed71bdc2a82842a593db9a150a0b8b7a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 23 Feb 2015 21:52:07 +0100 Subject: [PATCH 068/131] [rtve] Extract subtitles --- test/test_subtitles.py | 15 +++++++++++++++ youtube_dl/extractor/rtve.py | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 7f93f0a75..3f2d8a2ba 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -25,6 +25,7 @@ from youtube_dl.extractor import ( RaiIE, VikiIE, ThePlatformIE, + RTVEALaCartaIE, ) @@ -305,5 +306,19 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +class TestRtveSubtitles(BaseTestSubtitles): + url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' + IE = RTVEALaCartaIE + + def test_allsubtitles(self): + print('Skipping, only available from Spain') + return + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['es'])) + self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca') + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index e60f85b5b..27cd34b7d 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -102,14 +102,26 @@ class RTVEALaCartaIE(InfoExtractor): video_url = 
compat_urlparse.urljoin( 'http://mvod1.akcdn.rtve.es/', video_path) + subtitles = None + if info.get('sbtFile') is not None: + subtitles = self.extract_subtitles(video_id, info['sbtFile']) + return { 'id': video_id, 'title': info['title'], 'url': video_url, 'thumbnail': info.get('image'), 'page_url': url, + 'subtitles': subtitles, } + def _get_subtitles(self, video_id, sub_file): + subs = self._download_json( + sub_file + '.json', video_id, + 'Downloading subtitles info')['page']['items'] + return dict((s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + for s in subs) + class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' From ec5913b5cd92dfb8607ec535e02b04bdc09ff804 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:08:00 +0100 Subject: [PATCH 069/131] [bloomberg] Modernize --- youtube_dl/extractor/bloomberg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index c51a97ce4..4a88ccd13 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' + _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<id>.+?)\.html' _TEST = { 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', @@ -20,9 +20,9 @@ class BloombergIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + name = self._match_id(url) webpage = self._download_webpage(url, name) + f4m_url = self._search_regex( r'<source src="(https?://[^"]+\.f4m.*?)"', webpage, 'f4m url') From b665ba6aa6551243aa1a5b707ee7034be356f1bb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:24:26 +0100 Subject: [PATCH 070/131] release 
2015.02.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1852d834b..589f38834 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.23.1' +__version__ = '2015.02.24' From 9c665ab72e5fc99989800109cdada5acc3af56c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:37:27 +0100 Subject: [PATCH 071/131] [rtve] PEP8 --- youtube_dl/extractor/rtve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 27cd34b7d..c0fd23ff1 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -119,7 +119,8 @@ class RTVEALaCartaIE(InfoExtractor): subs = self._download_json( sub_file + '.json', video_id, 'Downloading subtitles info')['page']['items'] - return dict((s['lang'], [{'ext': 'vtt', 'url': s['src']}]) + return dict( + (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) for s in subs) From 5a42414b9c4718f83f28fbc0e5a4a01ab67f23f6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:38:01 +0100 Subject: [PATCH 072/131] [utils] Prevent hyphen at beginning of filename (Fixes #5035) --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index c7373af1e..2f8996d7b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -85,6 +85,8 @@ class TestUtil(unittest.TestCase): self.assertEqual( sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') + self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') forbidden = '"\0\\/' for fc in forbidden: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 475fad3c9..e2631dccd 100644 
--- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -304,6 +304,8 @@ def sanitize_filename(s, restricted=False, is_id=False): # Common case of "Foreign band name - English song title" if restricted and result.startswith('-_'): result = result[2:] + if result.startswith('-'): + result = '_' + result[len('-'):] if not result: result = '_' return result From db8e13ef714544574691f3dd4255dd0f12c1cf77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 11:38:21 +0100 Subject: [PATCH 073/131] release 2015.02.24.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 589f38834..a420860ed 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.24' +__version__ = '2015.02.24.1' From 54233c9080c1956f53802988fd8d5328cb38b7d7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 16:33:07 +0100 Subject: [PATCH 074/131] [escapist] Support JavaScript player (Fixes #5034) --- youtube_dl/extractor/escapist.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index b49b9869f..51ffec7ee 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -42,7 +42,14 @@ class EscapistIE(InfoExtractor): title = raw_title.partition(' : ')[2] config_url = compat_urllib_parse.unquote(self._html_search_regex( - r'<param\s+name="flashvars"\s+value="config=([^"&]+)', webpage, 'config URL')) + r'''(?x) + (?: + <param\s+name="flashvars"\s+value="config=| + flashvars="config= + ) + ([^"&]+) + ''', + webpage, 'config URL')) formats = [] From 4f3b21e1c738a7dacd514eb59242da43e81b5ae1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 16:34:42 +0100 Subject: [PATCH 075/131] release 
2015.02.24.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a420860ed..d23c6ae3d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.24.1' +__version__ = '2015.02.24.2' From 7f09a662a0db2ad0a338645e4f2dbd43056a6fa6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 24 Feb 2015 23:55:57 +0800 Subject: [PATCH 076/131] [Letv] Add new extractor. Single video only --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/letv.py | 120 +++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 youtube_dl/extractor/letv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 40fc92cf7..829ab18a9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -237,6 +237,7 @@ from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .letv import LetvIE from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import ( diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py new file mode 100644 index 000000000..3eb20678e --- /dev/null +++ b/youtube_dl/extractor/letv.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import time +import datetime + +from .common import InfoExtractor +from ..compat import (compat_urlparse, compat_urllib_parse) +from ..utils import (ExtractorError, parse_iso8601) + + +class LetvIE(InfoExtractor): + _VALID_URL = r'http://www.letv.com/ptv/vplay/(?P<id>\d+).html' + + _TESTS = [{ + 'url': 'http://www.letv.com/ptv/vplay/22005890.html', + 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'info_dict': { + 'id': '22005890', + 'ext': 'mp4', + 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', + 'timestamp': 1424747397, + 
'upload_date': '20150224', + } + }, { + 'url': 'http://www.letv.com/ptv/vplay/1118082.html', + 'info_dict': { + 'id': '1118082', + 'ext': 'mp4', + } + }] + + @staticmethod + def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n + + # ror() and calcTimeKey() are reversed from a embedded swf file in KLetvPlayer.swf + def ror(self, param1, param2): + _loc3_ = 0 + while _loc3_ < param2: + param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) + _loc3_ += 1 + return param1 + + def calcTimeKey(self, param1): + _loc2_ = 773625421 + _loc3_ = self.ror(param1, _loc2_ % 13) + _loc3_ = _loc3_ ^ _loc2_ + _loc3_ = self.ror(_loc3_, _loc2_ % 17) + return _loc3_ + + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) + params = { + 'id': media_id, + 'platid': 1, + 'splatid': 101, + 'format': 1, + 'tkey': self.calcTimeKey(int(time.time())), + 'domain': 'www.letv.com' + } + play_json = self._download_json( + 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params), + media_id, 'playJson data') + + # Check for errors + playstatus = play_json['playstatus'] + if playstatus['status'] == 0: + flag = playstatus['flag'] + if flag == 1: + msg = 'Country %s auth error' % playstatus['country'] + else: + msg = 'Generic error. 
flag = %d' % flag + raise ExtractorError(msg, expected=True) + + playurl = play_json['playurl'] + + formats = ['350', '1000', '1300', '720p', '1080p'] + dispatch = playurl['dispatch'] + + urls = [] + for format_id in formats: + if format_id in dispatch: + media_url = playurl['domain'][0] + dispatch[format_id][0] + + # Mimic what flvxz.com do + url_parts = list(compat_urlparse.urlparse(media_url)) + qs = dict(compat_urlparse.parse_qs(url_parts[4])) + qs.update({ + 'platid': '14', + 'splatid': '1401', + 'tss': 'no', + 'retry': 1 + }) + url_parts[4] = compat_urllib_parse.urlencode(qs) + media_url = compat_urlparse.urlunparse(url_parts) + + url_info_dict = { + 'url': media_url, + 'ext': os.path.splitext(dispatch[format_id][1])[1][1:] + } + + if format_id[-1:] == 'p': + url_info_dict['height'] = format_id[:-1] + + urls.append(url_info_dict) + + publish_time = parse_iso8601(self._html_search_regex( + r'发布时间 ([^<>]+) ', page, 'publish time', fatal=False), + delimiter=' ', timezone=datetime.timedelta(hours=8)) + + return { + 'id': media_id, + 'formats': urls, + 'title': playurl['title'], + 'thumbnail': playurl['pic'], + 'timestamp': publish_time, + } From df4bd0d53ff4baff6ce25ad04a1e87f37777c3ff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 24 Feb 2015 17:25:02 +0100 Subject: [PATCH 077/131] [options] Add --yes-playlist as inverse of --no-playlist (Fixes #5051) --- youtube_dl/options.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5c2d153b1..886ce9613 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -272,6 +272,10 @@ def parseOpts(overrideArguments=None): '--no-playlist', action='store_true', dest='noplaylist', default=False, help='If the URL refers to a video and a playlist, download only the video.') + selection.add_option( + '--yes-playlist', + action='store_false', dest='noplaylist', default=False, + help='If the URL refers to a video and a playlist, 
download the playlist.') selection.add_option( '--age-limit', metavar='YEARS', dest='age_limit', default=None, type=int, From 41b264e77cd357444b632a132ea11ff7ddc3de1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 24 Feb 2015 23:06:44 +0600 Subject: [PATCH 078/131] [nrktv] Workaround subtitles conversion issues on python 2.6 (Closes #5036) --- youtube_dl/extractor/nrk.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 46f493cfc..1e4cfa2e7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -158,7 +159,9 @@ class NRKTVIE(InfoExtractor): def _get_subtitles(self, subtitlesurl, video_id, baseurl): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) - captions = self._download_xml(url, video_id, 'Downloading subtitles') + captions = self._download_xml( + url, video_id, 'Downloading subtitles', + transform_source=lambda s: s.replace(r'<br />', '\r\n')) lang = captions.get('lang', 'no') ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}')) srt = '' @@ -167,8 +170,7 @@ class NRKTVIE(InfoExtractor): duration = parse_duration(p.get('dur')) starttime = self._seconds2str(begin) endtime = self._seconds2str(begin + duration) - text = '\n'.join(p.itertext()) - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text) return {lang: [ {'ext': 'ttml', 'url': url}, {'ext': 'srt', 'data': srt}, From 570311610e33eb67d5a65a86705d54daaee17361 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 25 Feb 2015 00:45:11 +0800 Subject: [PATCH 
079/131] [Letv] Add playlist support --- youtube_dl/extractor/__init__.py | 6 +++- youtube_dl/extractor/letv.py | 54 +++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 829ab18a9..7b7d41adf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -237,7 +237,11 @@ from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE from .laola1tv import Laola1TvIE -from .letv import LetvIE +from .letv import ( + LetvIE, + LetvTvIE, + LetvPlaylistIE +) from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import ( diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 3eb20678e..d7d315634 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import os.path +import re import time import datetime @@ -11,7 +12,7 @@ from ..utils import (ExtractorError, parse_iso8601) class LetvIE(InfoExtractor): - _VALID_URL = r'http://www.letv.com/ptv/vplay/(?P<id>\d+).html' + _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html' _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', @@ -118,3 +119,54 @@ class LetvIE(InfoExtractor): 'thumbnail': playurl['pic'], 'timestamp': publish_time, } + + +class LetvTvIE(InfoExtractor): + _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html' + _TESTS = [{ + 'url': 'http://www.letv.com/tv/46177.html', + 'info_dict': { + 'id': '46177', + 'title': '美人天下', + 'description': 'md5:395666ff41b44080396e59570dbac01c' + }, + 'playlist_count': 35 + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page = self._download_webpage(url, playlist_id) + + media_urls = list(set(re.findall( + r'http://www.letv.com/ptv/vplay/\d+.html', page))) + entries = [self.url_result(media_url, ie='Letv') + for media_url in media_urls] + + title = 
self._html_search_meta('keywords', page, fatal=False).split(',')[0] + description = self._html_search_meta('description', page, fatal=False) + + return self.playlist_result(entries, playlist_id, playlist_title=title, + playlist_description=description) + + +class LetvPlaylistIE(LetvTvIE): + _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html' + _TESTS = [{ + 'url': 'http://tv.letv.com/izt/wuzetian/index.html', + 'info_dict': { + 'id': 'wuzetian', + 'title': '武媚娘传奇', + 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' + }, + 'playlist_count': 96 + }, { + 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', + 'info_dict': { + 'id': 'lswjzzjc', + # should be "劲舞青春", but I can't find a simple way to determine + # the playlist title + 'title': '乐视午间自制剧场', + 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' + }, + 'playlist_mincount': 7 + }] From 59c7cbd482ba82248cd1bdca3569da6035720f21 Mon Sep 17 00:00:00 2001 From: logon84 <rubenlogon@yahoo.es> Date: Tue, 24 Feb 2015 18:58:32 +0100 Subject: [PATCH 080/131] Update eporner.py Updated to work. Old version shows an error about being unable to extract "redirect_code" --- youtube_dl/extractor/eporner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 4de8d4bc5..9ae28855b 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,10 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) 
- EPORNER', webpage, 'title') - redirect_code = self._html_search_regex( - r'<script type="text/javascript" src="/config5/%s/([a-f\d]+)/">' % video_id, - webpage, 'redirect_code') - redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, redirect_code) + redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') From 677063594e28d11ada9d2bfd3a30397d4de24360 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 25 Feb 2015 02:10:55 +0800 Subject: [PATCH 081/131] [Letv] Update testcases --- youtube_dl/extractor/letv.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index d7d315634..d5839263c 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -23,14 +23,22 @@ class LetvIE(InfoExtractor): 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', 'timestamp': 1424747397, 'upload_date': '20150224', + 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', } }, { - 'url': 'http://www.letv.com/ptv/vplay/1118082.html', + 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { - 'id': '1118082', + 'id': '1415246', 'ext': 'mp4', - } + 'title': '美人天下01', + 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', + }, + 'expected_warnings': [ + 'publish time' + ] }] + # http://www.letv.com/ptv/vplay/1118082.html + # This video is available only in Mainland China @staticmethod def urshift(val, n): @@ -111,12 +119,14 @@ class LetvIE(InfoExtractor): publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', fatal=False), delimiter=' ', timezone=datetime.timedelta(hours=8)) + description = self._html_search_meta('description', page, fatal=False) return { 'id': media_id, 'formats': urls, 'title': playurl['title'], 'thumbnail': playurl['pic'], + 'description': description, 
'timestamp': publish_time, } @@ -142,7 +152,8 @@ class LetvTvIE(InfoExtractor): entries = [self.url_result(media_url, ie='Letv') for media_url in media_urls] - title = self._html_search_meta('keywords', page, fatal=False).split(',')[0] + title = self._html_search_meta('keywords', page, + fatal=False).split(',')[0] description = self._html_search_meta('description', page, fatal=False) return self.playlist_result(entries, playlist_id, playlist_title=title, @@ -158,13 +169,14 @@ class LetvPlaylistIE(LetvTvIE): 'title': '武媚娘传奇', 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' }, - 'playlist_count': 96 + # This playlist contains some extra videos other than the drama itself + 'playlist_mincount': 96 }, { 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', 'info_dict': { 'id': 'lswjzzjc', - # should be "劲舞青春", but I can't find a simple way to determine - # the playlist title + # The title should be "劲舞青春", but I can't find a simple way to + # determine the playlist title 'title': '乐视午间自制剧场', 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' }, From e765ed3a9c4ba52d4709a4a696881eae3401efa0 Mon Sep 17 00:00:00 2001 From: logon84 <rubenlogon@yahoo.es> Date: Tue, 24 Feb 2015 19:41:46 +0100 Subject: [PATCH 082/131] [eporner] Fix redirect_code error --- youtube_dl/extractor/eporner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 9ae28855b..f5943caa5 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,6 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) 
- EPORNER', webpage, 'title') + redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') From 37f885650c323e040a200bda9376bc7dbdf2ca25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 25 Feb 2015 01:08:54 +0600 Subject: [PATCH 083/131] [eporner] Simplify and hardcode age limit --- youtube_dl/extractor/eporner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index f5943caa5..e006921ec 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -35,8 +35,7 @@ class EpornerIE(InfoExtractor): title = self._html_search_regex( r'<title>(.*?) - EPORNER', webpage, 'title') - - redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, display_id) + redirect_url = 'http://www.eporner.com/config5/%s' % video_id player_code = self._download_webpage( redirect_url, display_id, note='Downloading player config') @@ -67,5 +66,5 @@ class EpornerIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'formats': formats, - 'age_limit': self._rta_search(webpage), + 'age_limit': 18, } From 637570326bfa12575fe210e52e2a39d6585891d8 Mon Sep 17 00:00:00 2001 From: Antti Ajanki <antti.ajanki@iki.fi> Date: Sun, 22 Feb 2015 10:16:51 +0200 Subject: [PATCH 084/131] [extractor/common] Extract the first of a seq of videos in a .smil file --- youtube_dl/extractor/common.py | 68 +++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 87fce9cd8..4fe99d25d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -921,39 +921,49 @@ class InfoExtractor(object): formats = [] rtmp_count = 0 - for video in smil.findall('./body/switch/video'): - src = video.get('src') - if 
not src: - continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - proto = video.get('proto') - if not proto: - if base: - if base.startswith('rtmp'): - proto = 'rtmp' - elif base.startswith('http'): - proto = 'http' - ext = video.get('ext') - if proto == 'm3u8': - formats.extend(self._extract_m3u8_formats(src, video_id, ext)) - elif proto == 'rtmp': - rtmp_count += 1 - streamer = video.get('streamer') or base - formats.append({ - 'url': streamer, - 'play_path': src, - 'ext': 'flv', - 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'width': width, - 'height': height, - }) + if smil.findall('./body/seq/video'): + video = smil.findall('./body/seq/video')[0] + fmts, rtmp_count = self._parse_smil_video(video, base, rtmp_count) + formats.extend(fmts) + else: + for video in smil.findall('./body/switch/video'): + fmts, rtmp_count = self._parse_smil_video(video, base, rtmp_count) + formats.extend(fmts) + self._sort_formats(formats) return formats + def _parse_smil_video(self, video, base, rtmp_count): + src = video.get('src') + if not src: + return ([], rtmp_count) + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + if not proto: + if base: + if base.startswith('rtmp'): + proto = 'rtmp' + elif base.startswith('http'): + proto = 'http' + ext = video.get('ext') + if proto == 'm3u8': + return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count) + elif proto == 'rtmp': + rtmp_count += 1 + streamer = video.get('streamer') or base + return ([{ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }], rtmp_count) + def 
_live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() From 6f4ba54079893a09c6aa78fe3420523fb96df858 Mon Sep 17 00:00:00 2001 From: Antti Ajanki <antti.ajanki@iki.fi> Date: Sun, 22 Feb 2015 10:18:36 +0200 Subject: [PATCH 085/131] [extractor/common] Extract HTTP (possibly f4m) URLs from a .smil file --- youtube_dl/extractor/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4fe99d25d..313688208 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -963,6 +963,14 @@ class InfoExtractor(object): 'width': width, 'height': height, }], rtmp_count) + elif proto.startswith('http'): + return ([{ + 'url': base + src, + 'ext': ext or 'flv', + 'tbr': bitrate, + 'width': width, + 'height': height, + }], rtmp_count) def _live_title(self, name): """ Generate the title for a live video """ From c4f8c453ae2f735fc2320856e15e66510d74fd72 Mon Sep 17 00:00:00 2001 From: Antti Ajanki <antti.ajanki@iki.fi> Date: Sun, 22 Feb 2015 21:03:49 +0200 Subject: [PATCH 086/131] [f4m] Refresh fragment list periodically on live streams --- youtube_dl/downloader/f4m.py | 59 ++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 7b8fe8cf5..1df9ebe5b 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -121,7 +121,8 @@ class FlvReader(io.BytesIO): self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved - self.read(1) + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 # time scale self.read_unsigned_int() # CurrentMediaTime @@ -160,6 +161,7 @@ class FlvReader(io.BytesIO): return { 'segments': segments, 'fragments': fragments, + 'live': live, } def read_bootstrap_info(self): @@ -182,6 +184,10 @@ def build_fragments_list(boot_info): for segment, fragments_count in 
segment_run_table['segment_run']: for _ in range(fragments_count): res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + return res @@ -246,6 +252,38 @@ class F4mFD(FileDownloader): self.report_error('Unsupported DRM') return media + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + if node.text is None: + bootstrap_url = compat_urlparse.urljoin( + base_url, node.attrib['url']) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = base64.b64decode(node.text) + boot_info = read_bootstrap_info(bootstrap) + return (boot_info, bootstrap_url) + def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') @@ -265,18 +303,13 @@ class F4mFD(FileDownloader): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - if bootstrap_node.text is None: - bootstrap_url = compat_urlparse.urljoin( - base_url, bootstrap_node.attrib['url']) - bootstrap = self.ydl.urlopen(bootstrap_url).read() - else: - bootstrap = base64.b64decode(bootstrap_node.text) + boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not 
None: metadata = base64.b64decode(metadata_node.text) else: metadata = None - boot_info = read_bootstrap_info(bootstrap) fragments_list = build_fragments_list(boot_info) if self.params.get('test', False): @@ -301,7 +334,8 @@ class F4mFD(FileDownloader): (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') write_flv_header(dest_stream) - write_metadata_tag(dest_stream, metadata) + if not live: + write_metadata_tag(dest_stream, metadata) # This dict stores the download progress, it's updated by the progress # hook @@ -348,7 +382,8 @@ class F4mFD(FileDownloader): http_dl.add_progress_hook(frag_progress_hook) frags_filenames = [] - for (seg_i, frag_i) in fragments_list: + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) url = base_url + name if akamai_pv: @@ -367,6 +402,10 @@ class F4mFD(FileDownloader): break frags_filenames.append(frag_filename) + if not fragments_list and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + self.to_screen('Updated available fragments: %d' % len(fragments_list)) + dest_stream.close() elapsed = time.time() - start From 5eaaeb7c317a543af4bde5eb7d465f3695fc97d9 Mon Sep 17 00:00:00 2001 From: Antti Ajanki <antti.ajanki@iki.fi> Date: Mon, 23 Feb 2015 21:56:35 +0200 Subject: [PATCH 087/131] [f4m] Tolerate missed fragments on live streams --- youtube_dl/downloader/f4m.py | 43 +++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 1df9ebe5b..3dc796faa 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -11,6 +11,7 @@ from .common import FileDownloader from .http import HttpFD from ..compat import ( compat_urlparse, + compat_urllib_error, ) from ..utils import ( struct_pack, @@ -389,22 +390,38 @@ class F4mFD(FileDownloader): if akamai_pv: url += '?' 
+ akamai_pv.strip(';') frag_filename = '%s-%s' % (tmpfilename, name) - success = http_dl.download(frag_filename, {'url': url}) - if not success: - return False - with open(frag_filename, 'rb') as down: - down_data = down.read() - reader = FlvReader(down_data) - while True: - _, box_type, box_data = reader.read_box_info() - if box_type == b'mdat': - dest_stream.write(box_data) - break - frags_filenames.append(frag_filename) + try: + success = http_dl.download(frag_filename, {'url': url}) + if not success: + return False + with open(frag_filename, 'rb') as down: + down_data = down.read() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break + if live: + os.remove(frag_filename) + else: + frags_filenames.append(frag_filename) + except (compat_urllib_error.HTTPError, ) as err: + if live and (err.code == 404 or err.code == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. 
+ msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise if not fragments_list and live and bootstrap_url: fragments_list = self._update_live_fragments(bootstrap_url, frag_i) - self.to_screen('Updated available fragments: %d' % len(fragments_list)) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) dest_stream.close() From b8988b63a6baa206b7f5e35d99a9f4eff6ec7b5e Mon Sep 17 00:00:00 2001 From: Antti Ajanki <antti.ajanki@iki.fi> Date: Tue, 24 Feb 2015 21:23:59 +0200 Subject: [PATCH 088/131] [wdr] Download a live stream --- youtube_dl/extractor/wdr.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index c90488500..b46802306 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -28,6 +28,7 @@ class WDRIE(InfoExtractor): 'title': 'Servicezeit', 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', 'upload_date': '20140310', + 'is_live': False }, 'params': { 'skip_download': True, @@ -41,6 +42,7 @@ class WDRIE(InfoExtractor): 'title': 'Marga Spiegel ist tot', 'description': 'md5:2309992a6716c347891c045be50992e4', 'upload_date': '20140311', + 'is_live': False }, 'params': { 'skip_download': True, @@ -55,6 +57,7 @@ class WDRIE(InfoExtractor): 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', 'description': 'md5:2309992a6716c347891c045be50992e4', 'upload_date': '20091129', + 'is_live': False }, }, { @@ -66,6 +69,7 @@ class WDRIE(InfoExtractor): 'title': 'Flavia Coelho: Amar é Amar', 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', 'upload_date': '20140717', + 'is_live': False }, }, { @@ -74,6 +78,20 @@ class WDRIE(InfoExtractor): 'info_dict': { 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', } + }, + { + 'url': 
'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'info_dict': { + 'id': 'mdb-103364', + 'title': 're:^WDR Fernsehen [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'ext': 'flv', + 'upload_date': '20150212', + 'is_live': True + }, + 'params': { + 'skip_download': True, + }, } ] @@ -119,6 +137,10 @@ class WDRIE(InfoExtractor): video_url = flashvars['dslSrc'][0] title = flashvars['trackerClipTitle'][0] thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None + is_live = flashvars.get('isLive', ['0'])[0] == '1' + + if is_live: + title = self._live_title(title) if 'trackerClipAirTime' in flashvars: upload_date = flashvars['trackerClipAirTime'][0] @@ -131,6 +153,13 @@ class WDRIE(InfoExtractor): if video_url.endswith('.f4m'): video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' ext = 'flv' + elif video_url.endswith('.smil'): + fmt = self._extract_smil_formats(video_url, page_id)[0] + video_url = fmt['url'] + sep = '&' if '?' in video_url else '?' + video_url += sep + video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43' + ext = fmt['ext'] else: ext = determine_ext(video_url) @@ -144,6 +173,7 @@ class WDRIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, + 'is_live': is_live } From 13d8fbef30f1d7608b02e78818fa2ec211d69e21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 25 Feb 2015 17:56:51 +0100 Subject: [PATCH 089/131] [generic] Don't set the 'title' if it's not defined in the entry (closes #5061) Some of them may be an 'url' result, which in general don't have the 'title' field. 
--- youtube_dl/extractor/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 875e1bf05..3aff57e30 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1208,7 +1208,9 @@ class GenericIE(InfoExtractor): return entries[0] else: for num, e in enumerate(entries, start=1): - e['title'] = '%s (%d)' % (e['title'], num) + # 'url' results don't have a title + if e.get('title') is not None: + e['title'] = '%s (%d)' % (e['title'], num) return { '_type': 'playlist', 'entries': entries, From 9504fc21b583d6ff968eec9d5843fc40b48725e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCller?= <mmu@grummel.net> Date: Wed, 25 Feb 2015 23:27:19 +0100 Subject: [PATCH 090/131] Fix the RTL extractor for new episodes by using a different hostname --- youtube_dl/extractor/rtlnow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index fd93cc66f..785a8045e 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -146,7 +146,7 @@ class RTLnowIE(InfoExtractor): mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text) if mobj: fmt = { - 'url': 'rtmpe://fmspay-fra2.rtl.de/' + mobj.group('hoster'), + 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'), 'play_path': 'mp4:' + mobj.group('play_path'), 'page_url': url, 'player_url': video_page_url + 'includes/vodplayer.swf', From c10ea454dc7a06bf4d911774fd12b49ce7550845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 25 Feb 2015 23:52:54 +0100 Subject: [PATCH 091/131] [telecinco] Recognize more urls (closes #5065) --- youtube_dl/extractor/mitele.py | 4 ++-- youtube_dl/extractor/telecinco.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git 
a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 256758323..d8897eb90 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -18,7 +18,7 @@ class MiTeleIE(InfoExtractor): IE_NAME = 'mitele.es' _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', 'md5': '6a75fe9d0d3275bead0cb683c616fddb', 'info_dict': { @@ -29,7 +29,7 @@ class MiTeleIE(InfoExtractor): 'display_id': 'programa-144', 'duration': 2913, }, - } + }] def _real_extract(self, url): episode = self._match_id(url) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index be3f72df7..251a68680 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -6,9 +6,9 @@ from .mitele import MiTeleIE class TelecincoIE(MiTeleIE): IE_NAME = 'telecinco.es' - _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<id>.*?)\.html' + _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/(?:[^/]+/)?(?P<id>.*?)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', 'info_dict': { 'id': 'MDSVID20141015_0058', @@ -16,4 +16,7 @@ class TelecincoIE(MiTeleIE): 'title': 'Con Martín Berasategui, hacer un bacalao al ...', 'duration': 662, }, - } + }, { + 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', + 'only_matching': True, + }] From 1c69bca25871ebb8b54ebc5cb9bb4b2124bb12ab Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 00:24:54 +0100 Subject: [PATCH 092/131] [escapist] Fix config URL matching --- youtube_dl/extractor/escapist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 
51ffec7ee..3244306dc 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -44,10 +44,10 @@ class EscapistIE(InfoExtractor): config_url = compat_urllib_parse.unquote(self._html_search_regex( r'''(?x) (?: - <param\s+name="flashvars"\s+value="config=| + <param\s+name="flashvars".*?\s+value="config=| flashvars="config= ) - ([^"&]+) + (https?://[^"&]+) ''', webpage, 'config URL')) From 183139340b4841f22357fcf2b6a5ce2dedd71146 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 00:40:12 +0100 Subject: [PATCH 093/131] [utils] Bump our user agent --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e2631dccd..506c896de 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -54,7 +54,7 @@ from .compat import ( compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From 13cd97f3dfd87373cfc501224692352cb20e8194 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 00:42:02 +0100 Subject: [PATCH 094/131] release 2015.02.26 --- README.md | 2 ++ youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 699401b49..2c53e2211 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,8 @@ which means you can modify it, redistribute it or use it however you like. dislike_count <? 50 & description" . --no-playlist If the URL refers to a video and a playlist, download only the video. + --yes-playlist If the URL refers to a video and a + playlist, download the playlist. 
--age-limit YEARS download only videos suitable for the given age --download-archive FILE Download only videos not listed in the diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d23c6ae3d..0cbf66ed1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.24.2' +__version__ = '2015.02.26' From 9a48926a5734e0c1530c4788680961610090a837 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 00:59:53 +0100 Subject: [PATCH 095/131] [escapist] Add support for advertisements --- youtube_dl/extractor/escapist.py | 41 ++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 3244306dc..b45c1dbd0 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -52,6 +52,7 @@ class EscapistIE(InfoExtractor): webpage, 'config URL')) formats = [] + ad_formats = [] def _add_format(name, cfgurl, quality): config = self._download_json( @@ -61,14 +62,19 @@ class EscapistIE(InfoExtractor): transform_source=js_to_json) playlist = config['playlist'] - video_url = next( - p['url'] for p in playlist - if p.get('eventCategory') == 'Video') - formats.append({ - 'url': video_url, - 'format_id': name, - 'quality': quality, - }) + for p in playlist: + if p.get('eventCategory') == 'Video': + ar = formats + elif p.get('eventCategory') == 'Video Postroll': + ar = ad_formats + else: + continue + + ar.append({ + 'url': p['url'], + 'format_id': name, + 'quality': quality, + }) _add_format('normal', config_url, quality=0) hq_url = (config_url + @@ -77,10 +83,9 @@ class EscapistIE(InfoExtractor): _add_format('hq', hq_url, quality=1) except ExtractorError: pass # That's fine, we'll just use normal quality - self._sort_formats(formats) - return { + res = { 'id': video_id, 'formats': formats, 'uploader': uploader, @@ -89,3 +94,19 @@ 
class EscapistIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage), 'description': description, } + + if self._downloader.params.get('include_ads') and ad_formats: + self._sort_formats(ad_formats) + ad_res = { + 'id': '%s-ad' % video_id, + 'title': '%s (Postroll)' % title, + 'formats': ad_formats, + } + return { + '_type': 'playlist', + 'entries': [res, ad_res], + 'title': title, + 'id': video_id, + } + + return res From 3e675fabe027ecf20e662454ff7a174596801256 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:25:00 +0100 Subject: [PATCH 096/131] [airmozilla] Be more tolerant when nonessential items are missing (#5030) --- test/test_utils.py | 1 + youtube_dl/extractor/airmozilla.py | 47 +++++++++++++++--------------- youtube_dl/utils.py | 5 ++++ 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 2f8996d7b..3fba8ae11 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -246,6 +246,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('2.5 hours'), 9000) self.assertEqual(parse_duration('02:03:04'), 7384) self.assertEqual(parse_duration('01:02:03:04'), 93784) + self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py index 44c20f886..611ad1e9d 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/youtube_dl/extractor/airmozilla.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) class AirMozillaIE(InfoExtractor): @@ -27,13 +31,6 @@ class AirMozillaIE(InfoExtractor): } } - _QUALITY_MAP = { - '360p': 0, - '576p': 1, - '640p': 2, - '720p': 3, - } - def _real_extract(self, url): display_id = self._match_id(url) webpage = 
self._download_webpage(url, display_id) @@ -43,19 +40,23 @@ class AirMozillaIE(InfoExtractor): jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata') metadata = self._parse_json(jwconfig, video_id) - formats = [] - for source in metadata['playlist'][0]['sources']: - fmt = { - 'url': source['file'], - 'ext': source['type'], - 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), - 'resolution': source['label'], - 'quality': self._QUALITY_MAP.get(source['label'], -1), - } - formats.append(fmt) + formats = [{ + 'url': source['file'], + 'ext': source['type'], + 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), + 'format': source['label'], + 'height': int(source['label'].rstrip('p')), + } for source in metadata['playlist'][0]['sources']] self._sort_formats(formats) - duration_match = re.search(r'Duration:(?: (?P<H>\d+) hours?)?(?: (?P<M>\d+) minutes?)?', webpage) + view_count = int_or_none(self._html_search_regex( + r'Views since archived: ([0-9]+)', + webpage, 'view count', fatal=False)) + timestamp = parse_iso8601(self._html_search_regex( + r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False)) + duration = parse_duration(self._search_regex( + r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)', + webpage, 'duration', fatal=False)) return { 'id': video_id, @@ -63,11 +64,11 @@ class AirMozillaIE(InfoExtractor): 'formats': formats, 'url': self._og_search_url(webpage), 'display_id': display_id, - 'thumbnail': metadata['playlist'][0]['image'], + 'thumbnail': metadata['playlist'][0].get('image'), 'description': self._og_search_description(webpage), - 'timestamp': parse_iso8601(self._html_search_regex(r'<time datetime="(.*?)"', webpage, 'timestamp')), + 'timestamp': timestamp, 'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None), - 'duration': int(duration_match.groupdict()['H'] or 0) * 3600 + int(duration_match.groupdict()['M'] or 0) * 
60, - 'view_count': int(self._html_search_regex(r'Views since archived: ([0-9]+)', webpage, 'view count')), + 'duration': duration, + 'view_count': view_count, 'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage), } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 506c896de..1f3bfef7d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1290,6 +1290,7 @@ def parse_duration(s): (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*| (?P<only_hours>[0-9.]+)\s*(?:hours?)| + \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| (?: (?: (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)? @@ -1308,10 +1309,14 @@ def parse_duration(s): return float_or_none(m.group('only_hours'), invscale=60 * 60) if m.group('secs'): res += int(m.group('secs')) + if m.group('mins_reversed'): + res += int(m.group('mins_reversed')) * 60 if m.group('mins'): res += int(m.group('mins')) * 60 if m.group('hours'): res += int(m.group('hours')) * 60 * 60 + if m.group('hours_reversed'): + res += int(m.group('hours_reversed')) * 60 * 60 if m.group('days'): res += int(m.group('days')) * 24 * 60 * 60 if m.group('ms'): From 84be7c230cd7b17c54926928e4108a1118f9892e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:25:54 +0100 Subject: [PATCH 097/131] Cred @duncankl for airmozilla --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bdd2a15dc..4674a5af3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -112,3 +112,4 @@ Frans de Jonge Robin de Rooij Ryan Schmidt Leslie P. 
Polzer +Duncan Keall From 265bfa2c79abc8f233132126be313ed2d4b18dc8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:30:18 +0100 Subject: [PATCH 098/131] [letv] Simplify --- youtube_dl/extractor/letv.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index d5839263c..583ce35b9 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -1,14 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -import os.path +import datetime import re import time -import datetime from .common import InfoExtractor -from ..compat import (compat_urlparse, compat_urllib_parse) -from ..utils import (ExtractorError, parse_iso8601) +from ..compat import ( + compat_urlparse, + compat_urllib_parse, +) +from ..utils import ( + determine_ext, + ExtractorError, + parse_iso8601, +) class LetvIE(InfoExtractor): @@ -44,7 +50,7 @@ class LetvIE(InfoExtractor): def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calcTimeKey() are reversed from a embedded swf file in KLetvPlayer.swf + # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: @@ -52,7 +58,7 @@ class LetvIE(InfoExtractor): _loc3_ += 1 return param1 - def calcTimeKey(self, param1): + def calc_time_key(self, param1): _loc2_ = 773625421 _loc3_ = self.ror(param1, _loc2_ % 13) _loc3_ = _loc3_ ^ _loc2_ @@ -67,7 +73,7 @@ class LetvIE(InfoExtractor): 'platid': 1, 'splatid': 101, 'format': 1, - 'tkey': self.calcTimeKey(int(time.time())), + 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.letv.com' } play_json = self._download_json( @@ -108,7 +114,7 @@ class LetvIE(InfoExtractor): url_info_dict = { 'url': media_url, - 'ext': os.path.splitext(dispatch[format_id][1])[1][1:] + 'ext': determine_ext(dispatch[format_id][1]) } if 
format_id[-1:] == 'p': From 72a406e7aa6ef751f5f2ad4b110b749ce40d70a4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:35:43 +0100 Subject: [PATCH 099/131] [extractor/common] Pass in video_id (#5057) --- youtube_dl/extractor/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 313688208..7977fa8d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -923,18 +923,18 @@ class InfoExtractor(object): rtmp_count = 0 if smil.findall('./body/seq/video'): video = smil.findall('./body/seq/video')[0] - fmts, rtmp_count = self._parse_smil_video(video, base, rtmp_count) + fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) formats.extend(fmts) else: for video in smil.findall('./body/switch/video'): - fmts, rtmp_count = self._parse_smil_video(video, base, rtmp_count) + fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) formats.extend(fmts) self._sort_formats(formats) return formats - def _parse_smil_video(self, video, base, rtmp_count): + def _parse_smil_video(self, video, video_id, base, rtmp_count): src = video.get('src') if not src: return ([], rtmp_count) From 2a15a98a6a077381cba52207b791d03506b0416c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:44:20 +0100 Subject: [PATCH 100/131] [rmtp] Encode filename before invoking subprocess This fixes #5066. 
Reproducible with LC_ALL=C youtube-dl "http://www.prosieben.de/tv/germanys-next-topmodel/video/playlist/ganze-folge-episode-2-das-casting-in-muenchen" --- youtube_dl/downloader/rtmp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 0a52c34c7..89e98ae61 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -119,7 +119,9 @@ class RtmpFD(FileDownloader): # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. - basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] + basic_args = [ + 'rtmpdump', '--verbose', '-r', url, + '-o', encodeFilename(tmpfilename, True)] if player_url is not None: basic_args += ['--swfVfy', player_url] if page_url is not None: From 1c6510f57a458aa9634923c0f48596b57d084649 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:47:12 +0100 Subject: [PATCH 101/131] [Makefile] clean pyc files in clean target --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 708732956..c6c76274f 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + find -name "*.pyc" -delete PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin From 5c340b038740061c61f55be1935e364a883a3138 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 01:47:16 +0100 Subject: [PATCH 102/131] release 2015.02.26.1 --- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 2 files 
changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9f70db80a..49b4ac8c1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -17,6 +17,7 @@ - **AdultSwim** - **Aftenposten** - **Aftonbladet** + - **AirMozilla** - **AlJazeera** - **Allocine** - **AlphaPorno** @@ -220,6 +221,9 @@ - **Ku6** - **la7.tv** - **Laola1Tv** + - **Letv** + - **LetvPlaylist** + - **LetvTv** - **lifenews**: LIFE | NEWS - **LiveLeak** - **livestream** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0cbf66ed1..9043ca695 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.26' +__version__ = '2015.02.26.1' From 63a562f95ee62e05e689bf7fbf8e923749248b05 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 09:19:26 +0100 Subject: [PATCH 103/131] [escapist] Detect IP blocking and use another UA (Fixes #5069) --- youtube_dl/extractor/escapist.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index b45c1dbd0..80e9084f4 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_request, ) from ..utils import ( ExtractorError, @@ -12,6 +13,7 @@ from ..utils import ( class EscapistIE(InfoExtractor): _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko' _TEST = { 'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', 'md5': 'ab3a706c681efca53f0a35f1415cf0d1', @@ -28,7 +30,9 @@ class EscapistIE(InfoExtractor): def 
_real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage_req = compat_urllib_request.Request(url) + webpage_req.add_header('User-Agent', self._USER_AGENT) + webpage = self._download_webpage(webpage_req, video_id) uploader_id = self._html_search_regex( r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'", @@ -54,9 +58,11 @@ class EscapistIE(InfoExtractor): formats = [] ad_formats = [] - def _add_format(name, cfgurl, quality): + def _add_format(name, cfg_url, quality): + cfg_req = compat_urllib_request.Request(cfg_url) + cfg_req.add_header('User-Agent', self._USER_AGENT) config = self._download_json( - cfgurl, video_id, + cfg_req, video_id, 'Downloading ' + name + ' configuration', 'Unable to download ' + name + ' configuration', transform_source=js_to_json) @@ -74,6 +80,9 @@ class EscapistIE(InfoExtractor): 'url': p['url'], 'format_id': name, 'quality': quality, + 'http_headers': { + 'User-Agent': self._USER_AGENT, + }, }) _add_format('normal', config_url, quality=0) @@ -85,6 +94,9 @@ class EscapistIE(InfoExtractor): pass # That's fine, we'll just use normal quality self._sort_formats(formats) + if '/escapist/sales-marketing/' in formats[-1]['url']: + raise ExtractorError('This IP address has been blocked by The Escapist', expected=True) + res = { 'id': video_id, 'formats': formats, From 9724e5d33661b6d6c84b6da64b78c0b96221ab24 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 26 Feb 2015 09:45:11 +0100 Subject: [PATCH 104/131] release 2015.02.26.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9043ca695..cf3e28bbe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.26.1' +__version__ = '2015.02.26.2' From 2e241242a3ee6338cafd515c9cd7481eb5c6f928 Mon Sep 17 00:00:00 2001 From: 
"PishPosh.McGee" <pishposh.mcgee@gmail.com> Date: Thu, 26 Feb 2015 03:59:35 -0600 Subject: [PATCH 105/131] Adding subtitles --- youtube_dl/extractor/comedycentral.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index b24538981..e5edcc84b 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -250,6 +250,8 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): }) self._sort_formats(formats) + subtitles = self._extract_subtitles(cdoc, guid) + virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) entries.append({ 'id': guid, @@ -260,6 +262,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): 'duration': duration, 'thumbnail': thumbnail, 'description': description, + 'subtitles': subtitles, }) return { From e129c5bc0de4913564c1e1a62baae4bd0073824c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Thu, 26 Feb 2015 14:35:48 +0200 Subject: [PATCH 106/131] [laola1tv] Allow live stream downloads --- youtube_dl/extractor/laola1tv.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index e8ca49fd1..b459559b0 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -27,8 +27,6 @@ class Laola1TvIE(InfoExtractor): } } - _BROKEN = True # Not really - extractor works fine, but f4m downloader does not support live streams yet. 
- def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -57,11 +55,7 @@ class Laola1TvIE(InfoExtractor): title = xpath_text(hd_doc, './/video/title', fatal=True) flash_url = xpath_text(hd_doc, './/video/url', fatal=True) uploader = xpath_text(hd_doc, './/video/meta_organistation') - is_live = xpath_text(hd_doc, './/video/islive') == 'true' - if is_live: - raise ExtractorError( - 'Live streams are not supported by the f4m downloader.') categories = xpath_text(hd_doc, './/video/meta_sports') if categories: From 7ab7c9e93290f575747c1790c4aecf2055227a8d Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Thu, 26 Feb 2015 16:22:05 +0200 Subject: [PATCH 107/131] [gamestar] Fix title extraction --- youtube_dl/extractor/gamestar.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 7591a151e..590ccf526 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -31,7 +33,7 @@ class GameStarIE(InfoExtractor): webpage = self._download_webpage(url, video_id) og_title = self._og_search_title(webpage) - title = og_title.replace(' - Video bei GameStar.de', '').strip() + title = re.sub(r'\s*- Video (bei|-) GameStar\.de$', '', og_title) url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id From 6317a3e9da42f5d60e514fddcb89eec7f5768294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 26 Feb 2015 21:10:49 +0600 Subject: [PATCH 108/131] [mpora] Fix extraction --- youtube_dl/extractor/mpora.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py index 6db3c67a5..a07d28928 100644 --- 
a/youtube_dl/extractor/mpora.py +++ b/youtube_dl/extractor/mpora.py @@ -25,7 +25,9 @@ class MporaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_json = self._search_regex( - r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", webpage, 'json') + [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", + r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"], + webpage, 'json') data = self._parse_json(data_json, video_id) uploader = data['info_overlay'].get('username') From 250a9bdfe27ecf3d0acf179b4c21ff2126c64990 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 26 Feb 2015 21:16:35 +0600 Subject: [PATCH 109/131] [mpora] Improve _VALID_URL --- youtube_dl/extractor/mpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py index a07d28928..5a1bee5c8 100644 --- a/youtube_dl/extractor/mpora.py +++ b/youtube_dl/extractor/mpora.py @@ -5,7 +5,7 @@ from ..utils import int_or_none class MporaIE(InfoExtractor): - _VALID_URL = r'https?://(www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' + _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' IE_NAME = 'MPORA' _TEST = { From 1a2313a6f2d83c9e0a5c0c25bf5a2e3167994dbe Mon Sep 17 00:00:00 2001 From: anovicecodemonkey <anovicecodemonkey13435@mailinator.com> Date: Fri, 27 Feb 2015 02:36:45 +1030 Subject: [PATCH 110/131] [TheChiveIE] added support for TheChive.com (Closes #5016) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/thechive.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/thechive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ddb9d6670..b064a3a7c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -481,6 +481,7 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE 
from .tf1 import TF1IE +from .thechive import TheChiveIE from .theonion import TheOnionIE from .theplatform import ThePlatformIE from .thesixtyone import TheSixtyOneIE diff --git a/youtube_dl/extractor/thechive.py b/youtube_dl/extractor/thechive.py new file mode 100644 index 000000000..df1a7998d --- /dev/null +++ b/youtube_dl/extractor/thechive.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + + +class TheChiveIE(InfoExtractor): + _VALID_URL = r'http://(www\.)?thechive\.com/[^/]+/[^/]+/[^/]+/(?P<video_id>[A-Za-z\-]+)' + _TEST = { + 'url': "http://thechive.com/2015/02/20/so-thats-what-a-set-of-redneck-bagpipes-sound-like-video/", + 'md5': "366710dda77cfa727bdef3523ba8466f", + 'info_dict': { + 'id': "so-thats-what-a-set-of-redneck-bagpipes-sound-like-video", + 'title': "So that's what a set of redneck bagpipes sound like... (Video)", + 'description': "Okay that was pretty good. Now play Freebird!...", + 'thumbnail': "https://thechive.files.wordpress.com/2015/02/0_07dghz0w-thumbnail2.jpg", + 'author': "Ben", + 'upload_date': "20150220", + 'ext': "mp4" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + description = self._html_search_regex(r'(?s)<meta name="description" content="(.*?)" />', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) + author = self._html_search_regex( + r'(?s)itemprop="author">(.+?)</span>', webpage, 'author', fatal=False).capitalize() + upload_date = unified_strdate(self._html_search_regex( + r'(?s)itemprop="datePublished" datetime="(.+?)">', webpage, 'upload_date', fatal=False)) + + # Adapted from extractor/musicvault.py + VIDEO_URL_TEMPLATE = 'http://cdnapi.kaltura.com/p/%(uid)s/sp/%(wid)s/playManifest/entryId/%(entry_id)s/format/url/protocol/http' + + 
kaltura_id = self._search_regex( + r'entry_id=([^"]+)', + webpage, 'kaltura ID') + video_url = VIDEO_URL_TEMPLATE % { + 'entry_id': kaltura_id, + 'wid': self._search_regex(r'partner_id/([0-9]+)\?', webpage, 'wid'), + 'uid': self._search_regex(r'uiconf_id/([0-9]+)/', webpage, 'uid'), + } + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'author': author, + 'upload_date': upload_date, + 'ext': 'mp4' + } \ No newline at end of file From 0d97ef43bec006157870fd4a5cedfac1eaebf3a9 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Thu, 26 Feb 2015 23:45:54 +0200 Subject: [PATCH 111/131] [kaltura] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/kaltura.py | 134 +++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 youtube_dl/extractor/kaltura.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ddb9d6670..e3b2cb54f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -227,6 +227,7 @@ from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE +from .kaltura import KalturaIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .keezmovies import KeezMoviesIE diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py new file mode 100644 index 000000000..2aff410c5 --- /dev/null +++ b/youtube_dl/extractor/kaltura.py @@ -0,0 +1,134 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class KalturaIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?:kaltura:| + https?://(:?www\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_ + )(?P<partner_id>\d+) + (?::| + /(?:[^/]+/)*?entry_id/ + 
)(?P<id>[0-9a-z_]+)''' + _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' + _TESTS = [ + { + 'url': 'kaltura:269692:1_1jc2y3e4', + 'md5': '3adcbdb3dcc02d647539e53f284ba171', + 'info_dict': { + 'id': '1_1jc2y3e4', + 'ext': 'mp4', + 'title': 'Track 4', + 'upload_date': '20131219', + 'uploader_id': 'mlundberg@wolfgangsvault.com', + 'description': 'The Allman Brothers Band, 12/16/1981', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + }, + }, + { + 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', + 'only_matching': True, + }, + ] + + def _kaltura_api_call(self, video_id, actions, *args, **kwargs): + params = actions[0] + if len(actions) > 1: + for i, a in enumerate(actions[1:], start=1): + for k, v in a.items(): + params['%d:%s' % (i, k)] = v + + query = compat_urllib_parse.urlencode(params) + url = self._API_BASE + query + data = self._download_json(url, video_id, *args, **kwargs) + + status = data if len(actions) == 1 else data[0] + if status.get('objectType') == 'KalturaAPIException': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, status['message'])) + + return data + + def _get_kaltura_signature(self, video_id, partner_id): + actions = [{ + 'apiVersion': '3.1', + 'expiry': 86400, + 'format': 1, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': '_%s' % partner_id, + }] + return self._kaltura_api_call( + video_id, actions, note='Downloading Kaltura signature')['ks'] + + def _get_video_info(self, video_id, partner_id): + signature = self._get_kaltura_signature(video_id, partner_id) + actions = [ + { + 'action': 'null', + 'apiVersion': '3.1.5', + 'clientTag': 'kdp:v3.8.5', + 'format': 1, # JSON, 2 = XML, 3 = PHP + 'service': 'multirequest', + 'ks': signature, + }, + { + 'action': 'get', + 'entryId': video_id, + 'service': 'baseentry', + 'version': '-1', + }, + { + 'action': 'getContextData', + 'contextDataParams:objectType': 
'KalturaEntryContextDataParams', + 'contextDataParams:referrer': 'http://www.kaltura.com/', + 'contextDataParams:streamerType': 'http', + 'entryId': video_id, + 'service': 'baseentry', + }, + ] + return self._kaltura_api_call( + video_id, actions, note='Downloading video info JSON') + + def _real_extract(self, url): + video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + partner_id, entry_id = mobj.group('partner_id'), mobj.group('id') + + info, source_data = self._get_video_info(entry_id, partner_id) + + formats = [{ + 'format_id': '%(fileExt)s-%(bitrate)s' % f, + 'ext': f['fileExt'], + 'tbr': f['bitrate'], + 'fps': f.get('frameRate'), + 'filesize_approx': int_or_none(f.get('size'), invscale=1024), + 'container': f.get('containerFormat'), + 'vcodec': f.get('videoCodecId'), + 'height': f.get('height'), + 'width': f.get('width'), + 'url': '%s/flavorId/%s' % (info['dataUrl'], f['id']), + } for f in source_data['flavorAssets']] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info['name'], + 'formats': formats, + 'description': info.get('description'), + 'thumbnail': info.get('thumbnailUrl'), + 'duration': info.get('duration'), + 'timestamp': info.get('createdAt'), + 'uploader_id': info.get('userId'), + 'view_count': info.get('plays'), + } From da419e23320aef560c05f543c5ee375e8956c95a Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Thu, 26 Feb 2015 23:47:45 +0200 Subject: [PATCH 112/131] [musicvault] Use the Kaltura extractor --- youtube_dl/extractor/musicvault.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py index ebb1eb8e9..0e46ac7c1 100644 --- a/youtube_dl/extractor/musicvault.py +++ b/youtube_dl/extractor/musicvault.py @@ -3,17 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - parse_duration, - unified_strdate, 
-) class MusicVaultIE(InfoExtractor): _VALID_URL = r'https?://www\.musicvault\.com/(?P<uploader_id>[^/?#]*)/video/(?P<display_id>[^/?#]*)_(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html', - 'md5': '2cdbb3ae75f7fb3519821507d2fb3c15', + 'md5': '3adcbdb3dcc02d647539e53f284ba171', 'info_dict': { 'id': '1010863', 'ext': 'mp4', @@ -22,9 +18,10 @@ class MusicVaultIE(InfoExtractor): 'duration': 244, 'uploader': 'The Allman Brothers Band', 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'upload_date': '19811216', + 'upload_date': '20131219', 'location': 'Capitol Theatre (Passaic, NJ)', 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981', + 'timestamp': int, } } @@ -43,34 +40,24 @@ class MusicVaultIE(InfoExtractor): r'<h1.*?>(.*?)</h1>', data_div, 'uploader', fatal=False) title = self._html_search_regex( r'<h2.*?>(.*?)</h2>', data_div, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'<h3.*?>(.*?)</h3>', data_div, 'uploader', fatal=False)) location = self._html_search_regex( r'<h4.*?>(.*?)</h4>', data_div, 'location', fatal=False) - duration = parse_duration(self._html_search_meta('duration', webpage)) - - VIDEO_URL_TEMPLATE = 'http://cdnapi.kaltura.com/p/%(uid)s/sp/%(wid)s/playManifest/entryId/%(entry_id)s/format/url/protocol/http' kaltura_id = self._search_regex( r'<div id="video-detail-player" data-kaltura-id="([^"]+)"', webpage, 'kaltura ID') - video_url = VIDEO_URL_TEMPLATE % { - 'entry_id': kaltura_id, - 'wid': self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid'), - 'uid': self._search_regex(r'uiconf_id/([0-9]+)/', webpage, 'uid'), - } + wid = self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid') return { 'id': mobj.group('id'), - 'url': video_url, - 'ext': 'mp4', + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (wid, kaltura_id), + 'ie_key': 'Kaltura', 'display_id': 
display_id, 'uploader_id': mobj.group('uploader_id'), 'thumbnail': thumbnail, 'description': self._html_search_meta('description', webpage), - 'upload_date': upload_date, 'location': location, 'title': title, 'uploader': uploader, - 'duration': duration, } From e3216b82bf6ef54db63984f7fece4e95fbc3b981 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Fri, 27 Feb 2015 00:34:19 +0200 Subject: [PATCH 113/131] [generic] Support dynamic Kaltura embeds (#5016) (#5073) --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3aff57e30..27e2bc300 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -557,6 +557,18 @@ class GenericIE(InfoExtractor): 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, + # Kaltura embed + { + 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', + 'info_dict': { + 'id': '1_eergr3h1', + 'ext': 'mp4', + 'upload_date': '20150226', + 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', + 'timestamp': int, + 'title': 'John Carlson Postgame 2/25/15', + }, + }, ] def report_following_redirect(self, new_url): @@ -1113,6 +1125,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Zapiks') + # Look for Kaltura embeds + mobj = re.search( + r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) + if mobj is not None: + return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From 0eba1e178230a88d1b316f54edbb671d216c1d02 Mon Sep 17 00:00:00 2001 From: Sergey <Ftornik@users.noreply.github.com> Date: Fri, 27 Feb 2015 00:51:22 +0200 Subject: [PATCH 114/131] [lynda] Fixed subtitles broken file --- youtube_dl/extractor/lynda.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 109055e72..8bb21ee1d 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -152,7 +152,7 @@ class LyndaIE(InfoExtractor): continue appear_time = m_current.group('timecode') disappear_time = m_next.group('timecode') - text = seq_current['Caption'] + text = seq_current['Caption'].strip() srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) if srt: return srt From f3bff94cf96587462baa8da6d7e5f17801381753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 27 Feb 2015 12:24:51 +0100 Subject: [PATCH 115/131] [rtve] Extract duration --- youtube_dl/extractor/rtve.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index c0fd23ff1..b42442d12 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -8,8 +8,9 @@ import time from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - struct_unpack, + float_or_none, remove_end, + struct_unpack, ) @@ -67,6 +68,7 @@ class RTVEALaCartaIE(InfoExtractor): 'id': '2491869', 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', + 'duration': 5024.566, }, }, { 'note': 'Live stream', @@ -113,6 +115,7 @@ class RTVEALaCartaIE(InfoExtractor): 'thumbnail': info.get('image'), 'page_url': url, 'subtitles': subtitles, + 'duration': float_or_none(info.get('duration'), scale=1000), } def _get_subtitles(self, video_id, sub_file): From 7862ad88b725daae957ad27ae60993e360c01e13 Mon Sep 17 00:00:00 2001 From: HanYOLO <HanYOLO@users.noreply.github.com> Date: Fri, 27 Feb 2015 15:41:58 +0100 Subject: [PATCH 116/131] puls4 Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/puls4.py | 61 ++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/puls4.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e3b2cb54f..d137e1104 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -373,6 +373,7 @@ from .pornotube import PornotubeIE from .pornoxo import PornoXOIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE +from .puls4 import Puls4IE from .pyvideo import PyvideoIE from .quickvid import QuickVidIE from .r7 import R7IE diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py new file mode 100644 index 000000000..70dedbff3 --- /dev/null +++ b/youtube_dl/extractor/puls4.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re + + +class Puls4IE(InfoExtractor): + + _VALID_URL = r'https?://www.puls4.com/video/.+?/play/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', + 'md5': '49f6a6629747eeec43cef6a46b5df81d', + 'info_dict': { + 'id': '2716816', + 'ext': 'mp4', + 'title': 'Pro und Contra vom 23.02.2015'}}, + { + 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', + 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', + 'info_dict': { + 'id': '1298106', + 'ext': 
'mp4', + 'title': 'Lucky Fritz'}} + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # if fsk-button + real_url = self._html_search_regex(r'\"fsk-button\".+?href=\"([^"]+)', + webpage, 'fsk_button', default=None) + if real_url: + webpage = self._download_webpage(real_url, video_id) + + title = self._html_search_regex( + r'<div id="bg_brandableContent">.+?<h1>(.+?)</h1>', + webpage, 'title', flags=re.DOTALL) + + sd_url = self._html_search_regex( + r'{\"url\":\"([^"]+?)\",\"hd\":false', + webpage, 'sd_url').replace('\\', '') + + formats = [{'format_id': 'sd', 'url': sd_url, 'quality': -2}] + + hd_url = self._html_search_regex( + r'{\"url\":\"([^"]+?)\",\"hd\":true', + webpage, 'hd_url', default=None) + if hd_url: + hd_url = hd_url.replace('\\', '') + formats.append({'format_id': 'hd', 'url': hd_url, 'quality': -1}) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'ext': 'mp4' + } From a0d646135aaf417e0aa000419974c676335d164a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 27 Feb 2015 20:56:06 +0600 Subject: [PATCH 117/131] [lynda] Extend _VALID_URL --- youtube_dl/extractor/lynda.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 109055e72..e7e9d80de 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -18,7 +18,7 @@ from ..utils import ( class LyndaIE(InfoExtractor): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' + _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(\d+)' _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' _NETRC_MACHINE = 'lynda' @@ -27,7 +27,7 @@ class LyndaIE(InfoExtractor): ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
- _TEST = { + _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', 'info_dict': { @@ -36,7 +36,10 @@ class LyndaIE(InfoExtractor): 'title': 'Using the exercise files', 'duration': 68 } - } + }, { + 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', + 'only_matching': True, + }] def _real_initialize(self): self._login() From 781a7ef60ab03cf1e646a80ecf4cbd70f1532db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 27 Feb 2015 16:18:18 +0100 Subject: [PATCH 118/131] [lynda] Use 'lstrip' for the subtitles The newlines at the end are important, they separate each piece of text. --- youtube_dl/extractor/lynda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index bfd9b73d2..5dc22da22 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -155,7 +155,7 @@ class LyndaIE(InfoExtractor): continue appear_time = m_current.group('timecode') disappear_time = m_next.group('timecode') - text = seq_current['Caption'].strip() + text = seq_current['Caption'].lstrip() srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) if srt: return srt From 4ffbf77886b1c6c3f8d767223ca088395d08ab14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 00:15:03 +0600 Subject: [PATCH 119/131] [odnoklassniki] Add extractor (Closes #5075) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/odnoklassniki.py | 85 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 youtube_dl/extractor/odnoklassniki.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e3b2cb54f..aecb67bf4 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -346,6 +346,7 @@ from .ntvde import NTVDeIE from .ntvru import NTVRuIE from .nytimes import NYTimesIE from .nuvid import NuvidIE +from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .openfilm import OpenFilmIE diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py new file mode 100644 index 000000000..155d0ee6a --- /dev/null +++ b/youtube_dl/extractor/odnoklassniki.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + int_or_none, + qualities, +) + + +class OdnoklassnikiIE(InfoExtractor): + _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://ok.ru/video/20079905452', + 'md5': '8e24ad2da6f387948e7a7d44eb8668fe', + 'info_dict': { + 'id': '20079905452', + 'ext': 'mp4', + 'title': 'Культура меняет нас (прекрасный ролик!))', + 'duration': 100, + 'upload_date': '20141207', + 'uploader_id': '330537914540', + 'uploader': 'Виталий Добровольский', + 'like_count': int, + 'age_limit': 0, + }, + }, { + 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player = self._parse_json( + self._search_regex( + r"OKVideo\.start\(({.+?})\s*,\s*'VideoAutoplay_player'", webpage, 'player'), + video_id) + + metadata = self._parse_json(player['flashvars']['metadata'], video_id) + + movie = metadata['movie'] + title = movie['title'] + thumbnail = movie.get('poster') + duration = int_or_none(movie.get('duration')) + + author = metadata.get('author', {}) + uploader_id = author.get('id') + uploader = author.get('name') + + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload 
date')) + + age_limit = None + adult = self._html_search_meta( + 'ya:ovs:adult', webpage, 'age limit') + if adult: + age_limit = 18 if adult == 'true' else 0 + + like_count = int_or_none(metadata.get('likeCount')) + + quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) + + formats = [{ + 'url': f['url'], + 'ext': 'mp4', + 'format_id': f['name'], + 'quality': quality(f['name']), + } for f in metadata['videos']] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': age_limit, + 'formats': formats, + } From bd3749ed693ae96becd3832f20e765e1efe01476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 00:19:31 +0600 Subject: [PATCH 120/131] [kaltura] Extend _VALID_URL (Closes #5081) --- youtube_dl/extractor/kaltura.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 2aff410c5..d28730492 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -14,7 +14,7 @@ from ..utils import ( class KalturaIE(InfoExtractor): _VALID_URL = r'''(?x) (?:kaltura:| - https?://(:?www\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_ + https?://(:?(?:www|cdnapisec)\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_ )(?P<partner_id>\d+) (?::| /(?:[^/]+/)*?entry_id/ @@ -39,6 +39,10 @@ class KalturaIE(InfoExtractor): 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', 'only_matching': True, }, + { + 'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3', + 'only_matching': True, + }, ] def _kaltura_api_call(self, video_id, actions, *args, **kwargs): From 40b077bc7ee2722463f23974ce488acb7f11815a Mon Sep 17 00:00:00 2001 
From: thc202 <thc202@gmail.com> Date: Fri, 27 Feb 2015 22:27:30 +0000 Subject: [PATCH 121/131] [oppetarkiv] Add new extractor Some, if not all, of the videos appear to be geo-blocked (Sweden). Test might fail (403 Forbidden) if not run through a Swedish connection. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/oppetarkiv.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/oppetarkiv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index aecb67bf4..1544f1059 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -350,6 +350,7 @@ from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .openfilm import OpenFilmIE +from .oppetarkiv import OppetArkivIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/oppetarkiv.py b/youtube_dl/extractor/oppetarkiv.py new file mode 100644 index 000000000..6dd1fad3f --- /dev/null +++ b/youtube_dl/extractor/oppetarkiv.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class OppetArkivIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?oppetarkiv.se/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', + 'md5': '7b95ca9bedeead63012b2d7c3992c28f', + 'info_dict': { + 'id': '1058509', + 'ext': 'mp4', + 'title': 'Farlig kryssning', + 'duration': 2566, + 'thumbnail': 're:^https?://.*[\.-]jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + 'http://www.oppetarkiv.se/video/%s?output=json' % video_id, video_id) + + title = info['context']['title'] + thumbnail = info['context'].get('thumbnailImage') + + video_info = info['video'] + formats = [] + for vr in 
video_info['videoReferences']: + vurl = vr['url'] + if determine_ext(vurl) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + vurl, video_id, + ext='mp4', entry_protocol='m3u8_native', + m3u8_id=vr.get('playerType'))) + else: + formats.append({ + 'format_id': vr.get('playerType'), + 'url': vurl, + }) + self._sort_formats(formats) + + duration = video_info.get('materialLength') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'duration': duration, + } From 0f2c0d335b14ba1596e6608db7a6f29a0d9e1c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 28 Feb 2015 14:03:27 +0100 Subject: [PATCH 122/131] [YoutubeDL] Use the InfoExtractor._download_webpage method for getting the subtitles It handles encodings better, for example for 'http://www.npo.nl/nos-journaal/14-02-2015/POW_00942207' --- youtube_dl/YoutubeDL.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 76fc394bc..74e426168 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1300,17 +1300,18 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] + ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] if sub_info.get('data') is not None: sub_data = sub_info['data'] else: try: - uf = self.urlopen(sub_info['url']) - sub_data = uf.read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + sub_data = ie._download_webpage( + sub_info['url'], info_dict['id'], note=False) + except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, 
compat_str(err))) + (sub_lang, compat_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) From e9fade72f347cbcce779ff176d516467a425b43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 28 Feb 2015 14:43:24 +0100 Subject: [PATCH 123/131] Add postprocessor for converting subtitles (closes #4954) --- youtube_dl/__init__.py | 8 ++++++ youtube_dl/options.py | 4 +++ youtube_dl/postprocessor/__init__.py | 2 ++ youtube_dl/postprocessor/ffmpeg.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5ce201800..49f382695 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -170,6 +170,9 @@ def _real_main(argv=None): if opts.recodevideo is not None: if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: parser.error('invalid video recode format specified') + if opts.convertsubtitles is not None: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + parser.error('invalid subtitle format specified') if opts.date is not None: date = DateRange.day(opts.date) @@ -223,6 +226,11 @@ def _real_main(argv=None): 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) + if opts.convertsubtitles: + postprocessors.append({ + 'key': 'FFmpegSubtitlesConvertor', + 'format': opts.convertsubtitles, + }) if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 886ce9613..58f811162 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -751,6 +751,10 @@ def parseOpts(overrideArguments=None): '--exec', metavar='CMD', dest='exec_cmd', help='Execute a command on the file after downloading, similar to find\'s -exec syntax. 
Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'') + postproc.add_option( + '--convert-subtitles', '--convert-subs', + metavar='FORMAT', dest='convertsubtitles', default=None, + help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') parser.add_option_group(general) parser.add_option_group(network) diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 0ffbca258..708df3dd4 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -11,6 +11,7 @@ from .ffmpeg import ( FFmpegMergerPP, FFmpegMetadataPP, FFmpegVideoConvertorPP, + FFmpegSubtitlesConvertorPP, ) from .xattrpp import XAttrMetadataPP from .execafterdownload import ExecAfterDownloadPP @@ -31,6 +32,7 @@ __all__ = [ 'FFmpegMergerPP', 'FFmpegMetadataPP', 'FFmpegPostProcessor', + 'FFmpegSubtitlesConvertorPP', 'FFmpegVideoConvertorPP', 'XAttrMetadataPP', ] diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 398fe050e..30094c2f3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import io import os import subprocess import sys @@ -635,3 +636,40 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return True, info + + +class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): + def __init__(self, downloader=None, format=None): + super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) + self.format = format + + def run(self, info): + subs = info.get('requested_subtitles') + filename = info['filepath'] + new_ext = self.format + new_format = new_ext + if new_format == 'vtt': + new_format = 'webvtt' + if subs is None: + self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert') + return True, info + self._downloader.to_screen('[ffmpeg] Converting subtitles') + for lang, sub in subs.items(): + ext = 
sub['ext'] + if ext == new_ext: + self._downloader.to_screen( + '[ffmpeg] Subtitle file for %s is already in the requested' + 'format' % new_ext) + continue + new_file = subtitles_filename(filename, lang, new_ext) + self.run_ffmpeg( + subtitles_filename(filename, lang, ext), + new_file, ['-f', new_format]) + + with io.open(new_file, 'rt', encoding='utf-8') as f: + subs[lang] = { + 'ext': ext, + 'data': f.read(), + } + + return True, info From e143f5dae9c767529b8b522a9df63ac0ee8fc356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 21:12:06 +0600 Subject: [PATCH 124/131] [oppetarkiv] Extract f4m formats and age limit --- youtube_dl/extractor/oppetarkiv.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/oppetarkiv.py b/youtube_dl/extractor/oppetarkiv.py index 6dd1fad3f..ae6a28308 100644 --- a/youtube_dl/extractor/oppetarkiv.py +++ b/youtube_dl/extractor/oppetarkiv.py @@ -8,17 +8,19 @@ from ..utils import ( class OppetArkivIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?oppetarkiv.se/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?oppetarkiv\.se/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', - 'md5': '7b95ca9bedeead63012b2d7c3992c28f', + 'md5': '5c1eb616e59f733d4af77edc5177d2fe', 'info_dict': { 'id': '1058509', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Farlig kryssning', 'duration': 2566, 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'age_limit': 0, }, + 'skip': 'Only works from Sweden', } def _real_extract(self, url): @@ -33,11 +35,16 @@ class OppetArkivIE(InfoExtractor): formats = [] for vr in video_info['videoReferences']: vurl = vr['url'] - if determine_ext(vurl) == 'm3u8': + ext = determine_ext(vurl) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=vr.get('playerType'))) + elif ext == 'f4m': 
+ formats.extend(self._extract_f4m_formats( + vurl + '?hdcore=3.3.0', video_id, + f4m_id=vr.get('playerType'))) else: formats.append({ 'format_id': vr.get('playerType'), @@ -47,10 +54,13 @@ class OppetArkivIE(InfoExtractor): duration = video_info.get('materialLength') + age_limit = 18 if video_info.get('inappropriateForChildren') else 0 + return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'duration': duration, + 'age_limit': age_limit, } From df5ae3eb16effec4dc2609ed3520ddc068305efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 21:25:04 +0600 Subject: [PATCH 125/131] [oppetarkiv] Merge with svtplay --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/oppetarkiv.py | 66 ------------------------------ youtube_dl/extractor/svtplay.py | 42 +++++++++++++++---- 3 files changed, 34 insertions(+), 75 deletions(-) delete mode 100644 youtube_dl/extractor/oppetarkiv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1544f1059..aecb67bf4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -350,7 +350,6 @@ from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .openfilm import OpenFilmIE -from .oppetarkiv import OppetArkivIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/oppetarkiv.py b/youtube_dl/extractor/oppetarkiv.py deleted file mode 100644 index ae6a28308..000000000 --- a/youtube_dl/extractor/oppetarkiv.py +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - determine_ext, -) - - -class OppetArkivIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?oppetarkiv\.se/video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', - 'md5': 
'5c1eb616e59f733d4af77edc5177d2fe', - 'info_dict': { - 'id': '1058509', - 'ext': 'flv', - 'title': 'Farlig kryssning', - 'duration': 2566, - 'thumbnail': 're:^https?://.*[\.-]jpg$', - 'age_limit': 0, - }, - 'skip': 'Only works from Sweden', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.oppetarkiv.se/video/%s?output=json' % video_id, video_id) - - title = info['context']['title'] - thumbnail = info['context'].get('thumbnailImage') - - video_info = info['video'] - formats = [] - for vr in video_info['videoReferences']: - vurl = vr['url'] - ext = determine_ext(vurl) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - vurl, video_id, - ext='mp4', entry_protocol='m3u8_native', - m3u8_id=vr.get('playerType'))) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - vurl + '?hdcore=3.3.0', video_id, - f4m_id=vr.get('playerType'))) - else: - formats.append({ - 'format_id': vr.get('playerType'), - 'url': vurl, - }) - self._sort_formats(formats) - - duration = video_info.get('materialLength') - - age_limit = 18 if video_info.get('inappropriateForChildren') else 0 - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svtplay.py index eadb9ccb4..433dfd1cb 100644 --- a/youtube_dl/extractor/svtplay.py +++ b/youtube_dl/extractor/svtplay.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( determine_ext, @@ -8,23 +10,40 @@ from ..utils import ( class SVTPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?svtplay\.se/video/(?P<id>[0-9]+)' - _TEST = { + IE_DESC = 'SVT Play and Öppet arkiv' + _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 
'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final', - 'md5': 'f4a184968bc9c802a9b41316657aaa80', + 'md5': 'ade3def0643fa1c40587a422f98edfd9', 'info_dict': { 'id': '2609989', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'SM veckan vinter, Örebro - Rally, final', 'duration': 4500, 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'age_limit': 0, }, - } + }, { + 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', + 'md5': 'c3101a17ce9634f4c1f9800f0746c187', + 'info_dict': { + 'id': '1058509', + 'ext': 'flv', + 'title': 'Farlig kryssning', + 'duration': 2566, + 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'age_limit': 0, + }, + 'skip': 'Only works from Sweden', + }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') + info = self._download_json( - 'http://www.svtplay.se/video/%s?output=json' % video_id, video_id) + 'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id) title = info['context']['title'] thumbnail = info['context'].get('thumbnailImage') @@ -33,11 +52,16 @@ class SVTPlayIE(InfoExtractor): formats = [] for vr in video_info['videoReferences']: vurl = vr['url'] - if determine_ext(vurl) == 'm3u8': + ext = determine_ext(vurl) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=vr.get('playerType'))) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + vurl + '?hdcore=3.3.0', video_id, + f4m_id=vr.get('playerType'))) else: formats.append({ 'format_id': vr.get('playerType'), @@ -46,6 +70,7 @@ class SVTPlayIE(InfoExtractor): self._sort_formats(formats) duration = video_info.get('materialLength') + age_limit = 18 if video_info.get('inappropriateForChildren') else 0 return { 'id': video_id, @@ -53,4 +78,5 @@ class SVTPlayIE(InfoExtractor): 'formats': formats, 'thumbnail': thumbnail, 
'duration': duration, + 'age_limit': age_limit, } From 6c87c2eea8b7d14c4178aaae3d74559347a772e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 22:25:57 +0600 Subject: [PATCH 126/131] [puls4] Improve and extract more metadata --- youtube_dl/extractor/puls4.py | 89 +++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py index 70dedbff3..cce84b9e4 100644 --- a/youtube_dl/extractor/puls4.py +++ b/youtube_dl/extractor/puls4.py @@ -1,61 +1,88 @@ # -*- coding: utf-8 -*- - from __future__ import unicode_literals from .common import InfoExtractor - -import re +from ..utils import ( + ExtractorError, + unified_strdate, + int_or_none, +) class Puls4IE(InfoExtractor): - - _VALID_URL = r'https?://www.puls4.com/video/.+?/play/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', 'md5': '49f6a6629747eeec43cef6a46b5df81d', 'info_dict': { 'id': '2716816', 'ext': 'mp4', - 'title': 'Pro und Contra vom 23.02.2015'}}, - { + 'title': 'Pro und Contra vom 23.02.2015', + 'description': 'md5:293e44634d9477a67122489994675db6', + 'duration': 2989, + 'upload_date': '20150224', + 'uploader': 'PULS_4', + }, + 'skip': 'Only works from Germany', + }, { 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', 'info_dict': { 'id': '1298106', 'ext': 'mp4', - 'title': 'Lucky Fritz'}} - ] + 'title': 'Lucky Fritz', + }, + 'skip': 'Only works from Germany', + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # if fsk-button - real_url = self._html_search_regex(r'\"fsk-button\".+?href=\"([^"]+)', - webpage, 'fsk_button', default=None) + error_message = self._html_search_regex( + r'<div 
class="message-error">(.+?)</div>', + webpage, 'error message', default=None) + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + + real_url = self._html_search_regex( + r'\"fsk-button\".+?href=\"([^"]+)', + webpage, 'fsk_button', default=None) if real_url: webpage = self._download_webpage(real_url, video_id) - title = self._html_search_regex( - r'<div id="bg_brandableContent">.+?<h1>(.+?)</h1>', - webpage, 'title', flags=re.DOTALL) + player = self._search_regex( + r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', + webpage, 'player') - sd_url = self._html_search_regex( - r'{\"url\":\"([^"]+?)\",\"hd\":false', - webpage, 'sd_url').replace('\\', '') + player_json = self._parse_json( + '[%s]' % player, video_id, + transform_source=lambda s: s.replace('undefined,', '')) - formats = [{'format_id': 'sd', 'url': sd_url, 'quality': -2}] + formats = None + result = None - hd_url = self._html_search_regex( - r'{\"url\":\"([^"]+?)\",\"hd\":true', - webpage, 'hd_url', default=None) - if hd_url: - hd_url = hd_url.replace('\\', '') - formats.append({'format_id': 'hd', 'url': hd_url, 'quality': -1}) + for v in player_json: + if isinstance(v, list) and not formats: + formats = [{ + 'url': f['url'], + 'format': 'hd' if f.get('hd') else 'sd', + 'width': int_or_none(f.get('size_x')), + 'height': int_or_none(f.get('size_y')), + 'tbr': int_or_none(f.get('bitrate')), + } for f in v] + self._sort_formats(formats) + elif isinstance(v, dict) and not result: + result = { + 'id': video_id, + 'title': v['videopartname'].strip(), + 'description': v.get('videotitle'), + 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), + 'upload_date': unified_strdate(v.get('clipreleasetime')), + 'uploader': v.get('channel'), + } - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'ext': 'mp4' - } + result['formats'] = formats + + return result From 
0d103de3b0b03c5027f0015327c2c44b9073513a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 28 Feb 2015 22:59:55 +0600 Subject: [PATCH 127/131] [twitch] Pass api_token along with every request (Closes #3986) --- youtube_dl/extractor/twitch.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4b0d8988d..4b0ce54df 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -34,7 +34,15 @@ class TwitchBaseIE(InfoExtractor): expected=True) def _download_json(self, url, video_id, note='Downloading JSON metadata'): - response = super(TwitchBaseIE, self)._download_json(url, video_id, note) + headers = { + 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2', + 'X-Requested-With': 'XMLHttpRequest', + } + for cookie in self._downloader.cookiejar: + if cookie.name == 'api_token': + headers['Twitch-Api-Token'] = cookie.value + request = compat_urllib_request.Request(url, headers=headers) + response = super(TwitchBaseIE, self)._download_json(request, video_id, note) self._handle_error(response) return response From 8237bec4f05d6930146a7f0b087ae1e259917799 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 28 Feb 2015 20:52:52 +0100 Subject: [PATCH 128/131] [escapist] Extract duration --- youtube_dl/extractor/escapist.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 80e9084f4..e47f3e27a 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -8,6 +8,7 @@ from ..compat import ( from ..utils import ( ExtractorError, js_to_json, + parse_duration, ) @@ -25,6 +26,7 @@ class EscapistIE(InfoExtractor): 'uploader': 'The Escapist Presents', 'title': "Breaking Down Baldur's Gate", 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 264, } } @@ -41,6 +43,7 @@ class 
EscapistIE(InfoExtractor): r"<h1\s+class='headline'>(.*?)</a>", webpage, 'uploader', fatal=False) description = self._html_search_meta('description', webpage) + duration = parse_duration(self._html_search_meta('duration', webpage)) raw_title = self._html_search_meta('title', webpage, fatal=True) title = raw_title.partition(' : ')[2] @@ -105,6 +108,7 @@ class EscapistIE(InfoExtractor): 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'description': description, + 'duration': duration, } if self._downloader.params.get('include_ads') and ad_formats: From eee6293d572b85c700ef645ac96b1816cb45901d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 28 Feb 2015 20:55:44 +0100 Subject: [PATCH 129/131] [thechive] remove in favor of Kaltura (#5072) --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/thechive.py | 60 -------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 youtube_dl/extractor/thechive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f3967ff7b..ffcc7d9ab 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -484,7 +484,6 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE -from .thechive import TheChiveIE from .theonion import TheOnionIE from .theplatform import ThePlatformIE from .thesixtyone import TheSixtyOneIE diff --git a/youtube_dl/extractor/thechive.py b/youtube_dl/extractor/thechive.py deleted file mode 100644 index df1a7998d..000000000 --- a/youtube_dl/extractor/thechive.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import unified_strdate - - - -class TheChiveIE(InfoExtractor): - _VALID_URL = r'http://(www\.)?thechive\.com/[^/]+/[^/]+/[^/]+/(?P<video_id>[A-Za-z\-]+)' - _TEST = { - 'url': 
"http://thechive.com/2015/02/20/so-thats-what-a-set-of-redneck-bagpipes-sound-like-video/", - 'md5': "366710dda77cfa727bdef3523ba8466f", - 'info_dict': { - 'id': "so-thats-what-a-set-of-redneck-bagpipes-sound-like-video", - 'title': "So that's what a set of redneck bagpipes sound like... (Video)", - 'description': "Okay that was pretty good. Now play Freebird!...", - 'thumbnail': "https://thechive.files.wordpress.com/2015/02/0_07dghz0w-thumbnail2.jpg", - 'author': "Ben", - 'upload_date': "20150220", - 'ext': "mp4" - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage) - description = self._html_search_regex(r'(?s)<meta name="description" content="(.*?)" />', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - author = self._html_search_regex( - r'(?s)itemprop="author">(.+?)</span>', webpage, 'author', fatal=False).capitalize() - upload_date = unified_strdate(self._html_search_regex( - r'(?s)itemprop="datePublished" datetime="(.+?)">', webpage, 'upload_date', fatal=False)) - - # Adapted from extractor/musicvault.py - VIDEO_URL_TEMPLATE = 'http://cdnapi.kaltura.com/p/%(uid)s/sp/%(wid)s/playManifest/entryId/%(entry_id)s/format/url/protocol/http' - - kaltura_id = self._search_regex( - r'entry_id=([^"]+)', - webpage, 'kaltura ID') - video_url = VIDEO_URL_TEMPLATE % { - 'entry_id': kaltura_id, - 'wid': self._search_regex(r'partner_id/([0-9]+)\?', webpage, 'wid'), - 'uid': self._search_regex(r'uiconf_id/([0-9]+)/', webpage, 'uid'), - } - - return { - 'url': video_url, - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'author': author, - 'upload_date': upload_date, - 'ext': 'mp4' - } \ No newline at end of file From 01349011088b10861e283c245f5da56aa3d5fba0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 28 Feb 2015 
21:24:25 +0100 Subject: [PATCH 130/131] release 2015.02.28 --- README.md | 2 ++ docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c53e2211..04f664cd3 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,8 @@ which means you can modify it, redistribute it or use it however you like. downloading, similar to find's -exec syntax. Example: --exec 'adb push {} /sdcard/Music/ && rm {}' + --convert-subtitles FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt) # CONFIGURATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 49b4ac8c1..062cb3d62 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -210,6 +210,7 @@ - **Jove** - **jpopsuki.tv** - **Jukebox** + - **Kaltura** - **Kankan** - **Karaoketv** - **keek** @@ -308,6 +309,7 @@ - **Nuvid** - **NYTimes** - **ocw.mit.edu** + - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** - **Ooyala** @@ -334,6 +336,7 @@ - **PornoXO** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital + - **Puls4** - **Pyvideo** - **QuickVid** - **R7** @@ -412,7 +415,7 @@ - **StreamCZ** - **StreetVoice** - **SunPorno** - - **SVTPlay** + - **SVTPlay**: SVT Play and Öppet arkiv - **SWRMediathek** - **Syfy** - **SztvHu** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cf3e28bbe..5582348ba 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.02.26.2' +__version__ = '2015.02.28' From 003c69a84b68cadb46aeb8e03115848a722fd675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 28 Feb 2015 21:42:16 +0100 Subject: [PATCH 131/131] Use shutil.get_terminal_size for getting the terminal width if it's available (python >= 3.3) --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/compat.py | 30 
++++++++++++++++++++++++++++++ youtube_dl/options.py | 4 ++-- youtube_dl/utils.py | 17 ----------------- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 74e426168..d7c6db0ff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,6 +28,7 @@ from .compat import ( compat_basestring, compat_cookiejar, compat_expanduser, + compat_get_terminal_size, compat_http_client, compat_kwargs, compat_str, @@ -46,7 +47,6 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, - get_term_width, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -284,7 +284,7 @@ class YoutubeDL(object): try: import pty master, slave = pty.openpty() - width = get_term_width() + width = compat_get_terminal_size().columns if width is None: width_args = [] else: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e989cdbbd..b2bf149ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,9 +1,11 @@ from __future__ import unicode_literals +import collections import getpass import optparse import os import re +import shutil import socket import subprocess import sys @@ -364,6 +366,33 @@ def workaround_optparse_bug9161(): return real_add_option(self, *bargs, **bkwargs) optparse.OptionGroup.add_option = _compat_add_option +if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 + compat_get_terminal_size = shutil.get_terminal_size +else: + _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) + + def compat_get_terminal_size(): + columns = compat_getenv('COLUMNS', None) + if columns: + columns = int(columns) + else: + columns = None + lines = compat_getenv('LINES', None) + if lines: + lines = int(lines) + else: + lines = None + + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + lines, columns = map(int, out.split()) + except: + pass + return _terminal_size(columns, 
lines) + __all__ = [ 'compat_HTTPError', @@ -371,6 +400,7 @@ __all__ = [ 'compat_chr', 'compat_cookiejar', 'compat_expanduser', + 'compat_get_terminal_size', 'compat_getenv', 'compat_getpass', 'compat_html_entities', diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 58f811162..a2ffe96bc 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -8,11 +8,11 @@ import sys from .downloader.external import list_external_downloaders from .compat import ( compat_expanduser, + compat_get_terminal_size, compat_getenv, compat_kwargs, ) from .utils import ( - get_term_width, write_string, ) from .version import __version__ @@ -100,7 +100,7 @@ def parseOpts(overrideArguments=None): return opts # No need to wrap help messages if we're on a wide console - columns = get_term_width() + columns = compat_get_terminal_size().columns max_width = columns if columns else 80 max_help_position = 80 diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1f3bfef7d..d4938ec36 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,7 +35,6 @@ import zlib from .compat import ( compat_basestring, compat_chr, - compat_getenv, compat_html_entities, compat_http_client, compat_parse_qs, @@ -1173,22 +1172,6 @@ def parse_filesize(s): return int(float(num_str) * mult) -def get_term_width(): - columns = compat_getenv('COLUMNS', None) - if columns: - return int(columns) - - try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - return int(out.split()[1]) - except: - pass - return None - - def month_by_name(name): """ Return the number of a month by (locale-independently) English name """