From 67dfbc0cb92a19eda2981528b1456bdc0e3cb805 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:42:40 +0200 Subject: [PATCH 001/215] Added exceptions for the subtitle and video types in .gitignore --- .gitignore | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ca4e8f353..fca34b8ba 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ build/ dist/ MANIFEST README.txt +README.md youtube-dl.1 youtube-dl.bash-completion youtube-dl @@ -17,4 +18,10 @@ youtube-dl.tar.gz .coverage cover/ updates_key.pem -*.egg-info \ No newline at end of file +*.egg-info +*.srt +*.sbv +*.vtt +*.flv +*.mp4 +*.part From 5898e282726bc2f54fc52fe425c389226e31a797 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:48:24 +0200 Subject: [PATCH 002/215] Fixed small type issue --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e69d844b8..beed79fd0 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -492,7 +492,8 @@ class YoutubeDL(object): # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - for sub_lang in subtitles.keys(): + + for sub_lang in subtitles: sub = subtitles[sub_lang] if sub is None: continue From 953e32b2c1be077e65bba844010a5a2707af2e2b Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 18:59:11 +0200 Subject: [PATCH 003/215] [dailymotion] Added support for subtitles + new InfoExtractor for generic subtitle download. The idea is that all subtitle downloaders must descend from SubtitlesIE and implement only three basic methods to achieve the complete subtitle download functionality. This will allow to reduce the code in YoutubeIE once it is rewritten. 
--- test/test_dailymotion_subtitles.py | 96 +++++++++++++++++++++++++++++ youtube_dl/__init__.py | 10 +-- youtube_dl/extractor/dailymotion.py | 67 ++++++++++++++++++-- youtube_dl/extractor/subtitles.py | 80 ++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 11 deletions(-) create mode 100644 test/test_dailymotion_subtitles.py create mode 100644 youtube_dl/extractor/subtitles.py diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py new file mode 100644 index 000000000..f63426a18 --- /dev/null +++ b/test/test_dailymotion_subtitles.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import sys +import unittest +import json +import io +import hashlib + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionIE +from youtube_dl.utils import * +from helper import FakeYDL + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() +TEST_URL = 'http://www.dailymotion.com/video/xczg00' + +class TestDailymotionSubtitles(unittest.TestCase): + def setUp(self): + DL = FakeYDL() + DL.params['allsubtitles'] = False + DL.params['writesubtitles'] = False + DL.params['subtitlesformat'] = 'srt' + DL.params['listsubtitles'] = False + def test_no_subtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = False + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + subtitles = info_dict[0]['subtitles'] + self.assertEqual(subtitles, None) + def test_subtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + def test_subtitles_fr(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + DL.params['subtitleslang'] = 'fr' + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['fr'] + self.assertEqual(md5(sub), 
'594564ec7d588942e384e920e5341792') + def test_onlysubtitles(self): + DL = FakeYDL() + DL.params['writesubtitles'] = True + DL.params['onlysubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles']['en'] + self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + def test_allsubtitles(self): + DL = FakeYDL() + DL.params['allsubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + subtitles = info_dict[0]['subtitles'] + self.assertEqual(len(subtitles.keys()), 5) + # def test_subtitles_sbv_format(self): + # DL = FakeYDL() + # DL.params['writesubtitles'] = True + # DL.params['subtitlesformat'] = 'sbv' + # IE = DailymotionIE(DL) + # info_dict = IE.extract(TEST_URL) + # sub = info_dict[0]['subtitles'][0] + # self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') + # def test_subtitles_vtt_format(self): + # DL = FakeYDL() + # DL.params['writesubtitles'] = True + # DL.params['subtitlesformat'] = 'vtt' + # IE = DailymotionIE(DL) + # info_dict = IE.extract(TEST_URL) + # sub = info_dict[0]['subtitles'][0] + # self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') + def test_list_subtitles(self): + DL = FakeYDL() + DL.params['listsubtitles'] = True + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + self.assertEqual(info_dict, None) + def test_automatic_captions(self): + DL = FakeYDL() + DL.params['writeautomaticsub'] = True + DL.params['subtitleslang'] = 'en' + IE = DailymotionIE(DL) + info_dict = IE.extract(TEST_URL) + sub = info_dict[0]['subtitles'] + self.assertTrue(len(sub) == 0) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index eb23c53a5..c4d595e1c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -187,22 +187,22 @@ def parseOpts(overrideArguments=None): action='store_true', dest='listformats', help='list all available formats (currently youtube only)') 
video_format.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', - help='write subtitle file (currently youtube only)', default=False) + help='write subtitle file', default=False) video_format.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', - help='write automatic subtitle file (currently youtube only)', default=False) + help='write automatic subtitle file (youtube only)', default=False) video_format.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) video_format.add_option('--all-subs', action='store_true', dest='allsubtitles', - help='downloads all the available subtitles of the video (currently youtube only)', default=False) + help='downloads all the available subtitles of the video', default=False) video_format.add_option('--list-subs', action='store_true', dest='listsubtitles', - help='lists all available subtitles for the video (currently youtube only)', default=False) + help='lists all available subtitles for the video', default=False) video_format.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', - help='subtitle format [srt/sbv/vtt] (default=srt) (currently youtube only)', default='srt') + help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') video_format.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9bf7a28ca..eb2322d54 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,14 +1,49 @@ import re import json +import itertools +import socket from .common import InfoExtractor +from .subtitles import SubtitlesIE + from ..utils import ( + compat_http_client, + 
compat_urllib_error, compat_urllib_request, + compat_str, + get_element_by_attribute, + get_element_by_id, ExtractorError, ) -class DailymotionIE(InfoExtractor): + +class DailyMotionSubtitlesIE(SubtitlesIE): + + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + info = json.loads(sub_list) + if (info['total'] > 0): + sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + return sub_lang_list + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + + def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): + sub_lang_list = self._get_available_subtitles(video_id) + return sub_lang_list[sub_lang] + + def _request_automatic_caption(self, video_id, webpage): + self._downloader.report_warning(u'Automatic Captions not supported by dailymotion') + return {} + + +class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' @@ -18,7 +53,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } @@ -57,17 +92,36 @@ class DailymotionIE(InfoExtractor): # TODO: support choosing qualities - for key in ['stream_h264_hd1080_url','stream_h264_hd_url', - 'stream_h264_hq_url','stream_h264_url', + for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url', + 'stream_h264_hq_url', 'stream_h264_url', 
'stream_h264_ld_url']: - if info.get(key):#key in info and info[key]: + if info.get(key): # key in info and info[key]: max_quality = key - self.to_screen(u'Using %s' % key) + self.to_screen(u'%s: Using %s' % (video_id, key)) break else: raise ExtractorError(u'Unable to extract video URL') video_url = info[max_quality] + # subtitles + video_subtitles = None + video_webpage = None + + if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_subtitles(video_id) + elif self._downloader.params.get('writeautomaticsub', False): + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id) + return + + if 'length_seconds' not in info: + self._downloader.report_warning(u'unable to extract video duration') + video_duration = '' + else: + video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + return [{ 'id': video_id, 'url': video_url, @@ -75,5 +129,6 @@ class DailymotionIE(InfoExtractor): 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'ext': video_extension, + 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'] }] diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py new file mode 100644 index 000000000..89864e5d7 --- /dev/null +++ b/youtube_dl/extractor/subtitles.py @@ -0,0 +1,80 @@ +import socket + +from .common import InfoExtractor + +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_request, + compat_str, +) + + +class SubtitlesIE(InfoExtractor): + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) + + def _list_available_subtitles(self, video_id): + 
sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _extract_subtitles(self, video_id): + """ + Return a dictionary: {language: subtitles} or {} if the subtitles + couldn't be found + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if not sub_lang_list: #There was some error, it didn't get the available subtitles + return {} + if self._downloader.params.get('writesubtitles', False): + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) + return {} + sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + subtitles = {} + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + if subtitle: + subtitles[sub_lang] = subtitle + return subtitles + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ Return the subtitle as a string or None if they are not found """ + # return (u'Did not fetch video subtitles for %s' % sub_lang, None, None) + self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + url = self._get_subtitle_url(sub_lang, sub_name, video_id, format) + try: + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) + return + if not sub: + self._downloader.report_warning(u'Did not fetch video subtitles') + return + return sub + + def 
_get_available_subtitles(self, video_id): + """Get available subtitles. Redefine in subclasses.""" + """returns {(lang, url)} """ + # return {} + pass + + def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): + """returns the url for the given subtitle. Redefine in subclasses.""" + pass + + def _request_automatic_caption(self, video_id, webpage): + """Request automatic caption. Redefine in subclasses.""" + """returns a tuple of ... """ + # return [(err_msg, None, None)] + pass From 372297e713c92489c113bf8649ec4aa1d23511f9 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Wed, 7 Aug 2013 21:24:42 +0200 Subject: [PATCH 004/215] Undo the previous commit (it was a mistake) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index beed79fd0..ed5492826 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -493,7 +493,7 @@ class YoutubeDL(object): subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - for sub_lang in subtitles: + for sub_lang in subtitles.keys(): sub = subtitles[sub_lang] if sub is None: continue From 8377574c9cb8740e24d45e9b3d30921fd6ec846c Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 08:54:10 +0200 Subject: [PATCH 005/215] [internal] Improved subtitle architecture + (update in youtube/dailymotion) The structure of subtitles was refined, you only need to implement one method that returns a dictionnary of the available subtitles (lang, url) to support all the subtitle options in a website. I updated the subtitle downloaders for youtube/dailymotion to show how it works. 
--- youtube_dl/extractor/dailymotion.py | 15 +-- youtube_dl/extractor/subtitles.py | 27 ++--- youtube_dl/extractor/youtube.py | 175 ++++++++++------------------ 3 files changed, 73 insertions(+), 144 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index eb2322d54..97003ee35 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,6 +1,5 @@ import re import json -import itertools import socket from .common import InfoExtractor @@ -34,16 +33,12 @@ class DailyMotionSubtitlesIE(SubtitlesIE): self._downloader.report_warning(u'video doesn\'t have subtitles') return {} - def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): - sub_lang_list = self._get_available_subtitles(video_id) - return sub_lang_list[sub_lang] - def _request_automatic_caption(self, video_id, webpage): - self._downloader.report_warning(u'Automatic Captions not supported by dailymotion') + self._downloader.report_warning(u'Automatic Captions not supported by this server') return {} -class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): +class DailymotionIE(DailyMotionSubtitlesIE): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' @@ -116,12 +111,6 @@ class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor): self._list_available_subtitles(video_id) return - if 'length_seconds' not in info: - self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' - else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - return [{ 'id': video_id, 'url': video_url, diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 89864e5d7..8843e0220 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -15,7 +15,8 @@ class SubtitlesIE(InfoExtractor): def report_video_subtitles_available(self, video_id, 
sub_lang_list): """Report available subtitles.""" sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) + self.to_screen(u'%s: Available subtitles for video: %s' % + (video_id, sub_lang)) def _list_available_subtitles(self, video_id): sub_lang_list = self._get_available_subtitles(video_id) @@ -27,9 +28,9 @@ class SubtitlesIE(InfoExtractor): couldn't be found """ sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if not sub_lang_list: #There was some error, it didn't get the available subtitles + if not sub_lang_list: # error, it didn't get the available subtitles return {} + if self._downloader.params.get('writesubtitles', False): if self._downloader.params.get('subtitleslang', False): sub_lang = self._downloader.params.get('subtitleslang') @@ -41,18 +42,15 @@ class SubtitlesIE(InfoExtractor): self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) return {} sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} + subtitles = {} - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + for sub_lang, url in sub_lang_list.iteritems(): + subtitle = self._request_subtitle_url(sub_lang, url) if subtitle: subtitles[sub_lang] = subtitle return subtitles - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ Return the subtitle as a string or None if they are not found """ - # return (u'Did not fetch video subtitles for %s' % sub_lang, None, None) - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - url = self._get_subtitle_url(sub_lang, sub_name, video_id, format) + def _request_subtitle_url(self, sub_lang, url): try: sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, 
compat_http_client.HTTPException, socket.error) as err: @@ -64,13 +62,8 @@ class SubtitlesIE(InfoExtractor): return sub def _get_available_subtitles(self, video_id): - """Get available subtitles. Redefine in subclasses.""" - """returns {(lang, url)} """ - # return {} - pass - - def _get_subtitle_url(self, sub_lang, sub_name, video_id, format): - """returns the url for the given subtitle. Redefine in subclasses.""" + """returns the list of available subtitles like this {lang: url} """ + """or {} if not available. Must be redefined by the subclasses.""" pass def _request_automatic_caption(self, video_id, webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2b03226f6..414e33b49 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,6 +7,7 @@ import socket import itertools from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesIE from ..utils import ( compat_http_client, compat_parse_qs, @@ -24,7 +25,66 @@ from ..utils import ( ) -class YoutubeIE(InfoExtractor): +class YoutubeSubtitlesIE(SubtitlesIE): + + def _get_available_subtitles(self, video_id): + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + + sub_lang_list = {} + for l in lang_list: + lang = l[1] + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': self._downloader.params.get('subtitlesformat'), + }) + url = u'http://www.youtube.com/api/timedtext?' 
+ params + sub_lang_list[lang] = url + if not sub_lang_list: + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} + return sub_lang_list + + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = self._downloader.params.get('subtitleslang') or 'en' + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + self._downloader.report_warning(err_msg) + return {} + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return {sub_lang: sub} + # An extractor error can be raise by the download process if there are + # no automatic captions but there are subtitles + except (KeyError, ExtractorError): + self._downloader.report_warning(err_msg) + return {} + + +class YoutubeIE(YoutubeSubtitlesIE): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -151,19 +211,6 @@ class YoutubeIE(InfoExtractor): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading 
video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self.to_screen(u'%s: Extracting video information' % video_id) @@ -203,106 +250,6 @@ class YoutubeIE(InfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) - return {} - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) - if not sub_lang_list: - self._downloader.report_warning(u'video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return the subtitle as a string or None if they are not found - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' 
+ params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err))) - return - if not sub: - self._downloader.report_warning(u'Did not fetch video subtitles') - return - return sub - - def _request_automatic_caption(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') or 'en' - sub_format = self._downloader.params.get('subtitlesformat') - self.to_screen(u'%s: Looking for automatic captions' % video_id) - mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang - if mobj is None: - self._downloader.report_warning(err_msg) - return {} - player_config = json.loads(mobj.group(1)) - try: - args = player_config[u'args'] - caption_url = args[u'ttsurl'] - timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', - }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return {sub_lang: sub} - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _extract_subtitles(self, video_id): - """ - Return a dictionary: {language: subtitles} or {} if the subtitles - couldn't be found - """ - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if not sub_lang_list: #There was some error, it didn't get the available subtitles - return {} 
- if self._downloader.params.get('writesubtitles', False): - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) - return {} - sub_lang_list = {sub_lang: sub_lang_list[sub_lang]} - subtitles = {} - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - if subtitle: - subtitles[sub_lang] = subtitle - return subtitles - def _print_formats(self, formats): print('Available formats:') for x in formats: From 505c28aac90fbee46f0d54945b27e115f90785f2 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 09:53:25 +0200 Subject: [PATCH 006/215] Separated subtitle options in their own group --- youtube_dl/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c4d595e1c..8c6abddd9 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -119,6 +119,7 @@ def parseOpts(overrideArguments=None): selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') + subtitles = optparse.OptionGroup(parser, 'Subtitle Options') downloader = optparse.OptionGroup(parser, 'Download Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') filesystem = optparse.OptionGroup(parser, 'Filesystem Options') @@ -185,25 +186,26 @@ def parseOpts(overrideArguments=None): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', 
dest='listformats', help='list all available formats (currently youtube only)') - video_format.add_option('--write-sub', '--write-srt', + + subtitles.add_option('--write-sub', '--write-srt', action='store_true', dest='writesubtitles', help='write subtitle file', default=False) - video_format.add_option('--write-auto-sub', '--write-automatic-sub', + subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (youtube only)', default=False) - video_format.add_option('--only-sub', + subtitles.add_option('--only-sub', action='store_true', dest='skip_download', help='[deprecated] alias of --skip-download', default=False) - video_format.add_option('--all-subs', + subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video', default=False) - video_format.add_option('--list-subs', + subtitles.add_option('--list-subs', action='store_true', dest='listsubtitles', help='lists all available subtitles for the video', default=False) - video_format.add_option('--sub-format', + subtitles.add_option('--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt') - video_format.add_option('--sub-lang', '--srt-lang', + subtitles.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') @@ -328,6 +330,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) + parser.add_option_group(subtitles) parser.add_option_group(authentication) parser.add_option_group(postproc) From 33eb0ce4c4c515b30e5809f63f892b895601b442 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 10:06:24 +0200 Subject: [PATCH 007/215] 
[subtitles] removed only-sub option (--skip-download achieves the same functionality) --- test/parameters.json | 1 - test/test_dailymotion_subtitles.py | 8 -------- test/test_youtube_subtitles.py | 8 -------- youtube_dl/__init__.py | 3 --- 4 files changed, 20 deletions(-) diff --git a/test/parameters.json b/test/parameters.json index 96998b5c3..f042880ed 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -38,7 +38,6 @@ "writedescription": false, "writeinfojson": true, "writesubtitles": false, - "onlysubtitles": false, "allsubtitles": false, "listssubtitles": false } diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index f63426a18..32e3f6abe 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -46,14 +46,6 @@ class TestDailymotionSubtitles(unittest.TestCase): info_dict = IE.extract(TEST_URL) sub = info_dict[0]['subtitles']['fr'] self.assertEqual(md5(sub), '594564ec7d588942e384e920e5341792') - def test_onlysubtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['onlysubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') def test_allsubtitles(self): DL = FakeYDL() DL.params['allsubtitles'] = True diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index fe0eac680..fe5d097ce 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -45,14 +45,6 @@ class TestYoutubeSubtitles(unittest.TestCase): info_dict = IE.extract('QRS8MkLhQmM') sub = info_dict[0]['subtitles']['it'] self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') - def test_youtube_onlysubtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['onlysubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - 
self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') def test_youtube_allsubtitles(self): DL = FakeYDL() DL.params['allsubtitles'] = True diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 8c6abddd9..34f3dad0f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -193,9 +193,6 @@ def parseOpts(overrideArguments=None): subtitles.add_option('--write-auto-sub', '--write-automatic-sub', action='store_true', dest='writeautomaticsub', help='write automatic subtitle file (youtube only)', default=False) - subtitles.add_option('--only-sub', - action='store_true', dest='skip_download', - help='[deprecated] alias of --skip-download', default=False) subtitles.add_option('--all-subs', action='store_true', dest='allsubtitles', help='downloads all the available subtitles of the video', default=False) From 447591e1aea39f3100b66a7b94337bf67546663f Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 11:03:52 +0200 Subject: [PATCH 008/215] [test] Cleaned subtitles tests --- test/test_dailymotion_subtitles.py | 83 +++++++++------------------- test/test_youtube_subtitles.py | 88 ++++++++++++------------------ 2 files changed, 61 insertions(+), 110 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index 32e3f6abe..26c40493f 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -15,74 +15,43 @@ from youtube_dl.utils import * from helper import FakeYDL md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -TEST_URL = 'http://www.dailymotion.com/video/xczg00' class TestDailymotionSubtitles(unittest.TestCase): def setUp(self): - DL = FakeYDL() - DL.params['allsubtitles'] = False - DL.params['writesubtitles'] = False - DL.params['subtitlesformat'] = 'srt' - DL.params['listsubtitles'] = False + self.DL = FakeYDL() + self.url = 'http://www.dailymotion.com/video/xczg00' + def getInfoDict(self): + IE = DailymotionIE(self.DL) + info_dict = 
IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] def test_no_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = False - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - subtitles = info_dict[0]['subtitles'] + subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f') + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') def test_subtitles_fr(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitleslang'] = 'fr' - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles']['fr'] - self.assertEqual(md5(sub), '594564ec7d588942e384e920e5341792') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslang'] = 'fr' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') def test_allsubtitles(self): - DL = FakeYDL() - DL.params['allsubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - subtitles = info_dict[0]['subtitles'] + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 5) - # def test_subtitles_sbv_format(self): - # DL = FakeYDL() - # DL.params['writesubtitles'] = True - # DL.params['subtitlesformat'] = 'sbv' - # IE = DailymotionIE(DL) - # info_dict = IE.extract(TEST_URL) - # sub = info_dict[0]['subtitles'][0] - # self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') - # def test_subtitles_vtt_format(self): - # DL = FakeYDL() - # DL.params['writesubtitles'] = True - # 
DL.params['subtitlesformat'] = 'vtt' - # IE = DailymotionIE(DL) - # info_dict = IE.extract(TEST_URL) - # sub = info_dict[0]['subtitles'][0] - # self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') - def test_list_subtitles(self): - DL = FakeYDL() - DL.params['listsubtitles'] = True - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) + def test_list_subtitles(self): #ojo + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_automatic_captions(self): - DL = FakeYDL() - DL.params['writeautomaticsub'] = True - DL.params['subtitleslang'] = 'en' - IE = DailymotionIE(DL) - info_dict = IE.extract(TEST_URL) - sub = info_dict[0]['subtitles'] - self.assertTrue(len(sub) == 0) + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = 'en' + subtitles = self.getSubtitles() + self.assertTrue(len(subtitles.keys()) == 0) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index fe5d097ce..aa6a1a434 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -18,70 +18,52 @@ md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): - DL = FakeYDL() - DL.params['allsubtitles'] = False - DL.params['writesubtitles'] = False - DL.params['subtitlesformat'] = 'srt' - DL.params['listsubtitles'] = False + self.DL = FakeYDL() + self.url = 'QRS8MkLhQmM' + def getInfoDict(self): + IE = YoutubeIE(self.DL) + info_dict = IE.extract(self.url) + return info_dict + def getSubtitles(self): + info_dict = self.getInfoDict() + return info_dict[0]['subtitles'] def test_youtube_no_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = False - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL.params['writesubtitles'] = False + subtitles = self.getSubtitles() 
self.assertEqual(subtitles, None) def test_youtube_subtitles(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '4cd9278a35ba2305f47354ee13472260') + self.DL.params['writesubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') def test_youtube_subtitles_it(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitleslang'] = 'it' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['it'] - self.assertEqual(md5(sub), '164a51f16f260476a05b50fe4c2f161d') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitleslang'] = 'it' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') def test_youtube_allsubtitles(self): - DL = FakeYDL() - DL.params['allsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - subtitles = info_dict[0]['subtitles'] + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) def test_youtube_subtitles_sbv_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'sbv' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'sbv' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') def test_youtube_subtitles_vtt_format(self): - DL = FakeYDL() - DL.params['writesubtitles'] = True - DL.params['subtitlesformat'] = 'vtt' - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') - sub = info_dict[0]['subtitles']['en'] - 
self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7') + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'vtt' + subtitles = self.getSubtitles() + self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') def test_youtube_list_subtitles(self): - DL = FakeYDL() - DL.params['listsubtitles'] = True - IE = YoutubeIE(DL) - info_dict = IE.extract('QRS8MkLhQmM') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() self.assertEqual(info_dict, None) def test_youtube_automatic_captions(self): - DL = FakeYDL() - DL.params['writeautomaticsub'] = True - DL.params['subtitleslang'] = 'it' - IE = YoutubeIE(DL) - info_dict = IE.extract('8YoUxe5ncPo') - sub = info_dict[0]['subtitles']['it'] - self.assertTrue(sub is not None) + self.url = '8YoUxe5ncPo' + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslang'] = 'it' + subtitles = self.getSubtitles() + self.assertTrue(subtitles['it'] is not None) if __name__ == '__main__': unittest.main() From 69df680b973841b61594c246a9cf4a708f09cb17 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 11:20:56 +0200 Subject: [PATCH 009/215] [subtitles] Improved docs + new class for servers who don't support auto-caption --- youtube_dl/extractor/dailymotion.py | 9 ++------ youtube_dl/extractor/subtitles.py | 32 +++++++++++++++-------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 97003ee35..8fab16005 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -3,7 +3,7 @@ import json import socket from .common import InfoExtractor -from .subtitles import SubtitlesIE +from .subtitles import NoAutoSubtitlesIE from ..utils import ( compat_http_client, @@ -17,7 +17,7 @@ from ..utils import ( ) -class DailyMotionSubtitlesIE(SubtitlesIE): +class DailyMotionSubtitlesIE(NoAutoSubtitlesIE): def 
_get_available_subtitles(self, video_id): request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id) @@ -33,11 +33,6 @@ class DailyMotionSubtitlesIE(SubtitlesIE): self._downloader.report_warning(u'video doesn\'t have subtitles') return {} - def _request_automatic_caption(self, video_id, webpage): - self._downloader.report_warning(u'Automatic Captions not supported by this server') - return {} - - class DailymotionIE(DailyMotionSubtitlesIE): """Information Extractor for Dailymotion""" diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 8843e0220..caacea5fe 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,21 +12,15 @@ from ..utils import ( class SubtitlesIE(InfoExtractor): - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" + def _list_available_subtitles(self, video_id): + """ outputs the available subtitles for the video """ + sub_lang_list = self._get_available_subtitles(video_id) sub_lang = ",".join(list(sub_lang_list.keys())) self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - def _extract_subtitles(self, video_id): - """ - Return a dictionary: {language: subtitles} or {} if the subtitles - couldn't be found - """ + """ returns {sub_lang: sub} or {} if subtitles not found """ sub_lang_list = self._get_available_subtitles(video_id) if not sub_lang_list: # error, it didn't get the available subtitles return {} @@ -51,6 +45,7 @@ class SubtitlesIE(InfoExtractor): return subtitles def _request_subtitle_url(self, sub_lang, url): + """ makes the http request for the subtitle """ try: sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except 
(compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -62,12 +57,19 @@ class SubtitlesIE(InfoExtractor): return sub def _get_available_subtitles(self, video_id): - """returns the list of available subtitles like this {lang: url} """ - """or {} if not available. Must be redefined by the subclasses.""" + """ returns {sub_lang: url} or {} if not available """ + """ Must be redefined by the subclasses """ pass def _request_automatic_caption(self, video_id, webpage): - """Request automatic caption. Redefine in subclasses.""" - """returns a tuple of ... """ - # return [(err_msg, None, None)] + """ returns {sub_lang: sub} or {} if not available """ + """ Must be redefined by the subclasses """ pass + + +class NoAutoSubtitlesIE(SubtitlesIE): + """ A subtitle class for the servers that don't support auto-captions""" + + def _request_automatic_caption(self, video_id, webpage): + self._downloader.report_warning(u'Automatic Captions not supported by this server') + return {} From d55de6eec2adf7d1aaca87e75dad06ef15d9be26 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 18:30:04 +0200 Subject: [PATCH 010/215] [subtitles] Skips now the subtitles that has already been downloaded. Just a validation for file exists, I also removed a method that wasn't been used because it was a copy paste from FileDownloader. 
--- youtube_dl/YoutubeDL.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ed5492826..e11d6f994 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -221,19 +221,16 @@ class YoutubeDL(object): def report_writesubtitles(self, sub_filename): """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) + self.to_screen(u'[info] Writing subtitle: ' + sub_filename) + + def report_existingsubtitles(self, sub_filename): + """ Report that the subtitles file has been already written """ + self.to_screen(u'[info] Skipping existing subtitle: ' + sub_filename) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) - def report_file_already_downloaded(self, file_name): - """Report file has already been fully downloaded.""" - try: - self.to_screen(u'[download] %s has already been downloaded' % file_name) - except (UnicodeEncodeError) as err: - self.to_screen(u'[download] The file has already been downloaded') - def increment_downloads(self): """Increment the ordinal that assigns a number to each file.""" self._num_downloads += 1 @@ -492,13 +489,16 @@ class YoutubeDL(object): # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] sub_format = self.params.get('subtitlesformat') - + for sub_lang in subtitles.keys(): sub = subtitles[sub_lang] if sub is None: continue try: sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' 
+ sub_format + if os.path.isfile(encodeFilename(sub_filename)): + self.report_existingsubtitles(sub_filename) + continue self.report_writesubtitles(sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) From d80a064eff4fe2416f9db36b07f1e2ca641f1334 Mon Sep 17 00:00:00 2001 From: Ismael Mejia Date: Thu, 8 Aug 2013 22:22:33 +0200 Subject: [PATCH 011/215] [subtitles] Added tests to check correct behavior when no subtitles are available --- test/test_dailymotion_subtitles.py | 9 +++++++-- test/test_youtube_subtitles.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index 26c40493f..efc4e574f 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -27,14 +27,14 @@ class TestDailymotionSubtitles(unittest.TestCase): def getSubtitles(self): info_dict = self.getInfoDict() return info_dict[0]['subtitles'] - def test_no_subtitles(self): + def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) def test_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') - def test_subtitles_fr(self): + def test_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslang'] = 'fr' subtitles = self.getSubtitles() @@ -52,6 +52,11 @@ class TestDailymotionSubtitles(unittest.TestCase): self.DL.params['subtitleslang'] = 'en' subtitles = self.getSubtitles() self.assertTrue(len(subtitles.keys()) == 0) + def test_nosubtitles(self): + self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py 
b/test/test_youtube_subtitles.py index aa6a1a434..e40243077 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -27,7 +27,7 @@ class TestYoutubeSubtitles(unittest.TestCase): def getSubtitles(self): info_dict = self.getInfoDict() return info_dict[0]['subtitles'] - def test_youtube_no_subtitles(self): + def test_youtube_no_writesubtitles(self): self.DL.params['writesubtitles'] = False subtitles = self.getSubtitles() self.assertEqual(subtitles, None) @@ -35,7 +35,7 @@ class TestYoutubeSubtitles(unittest.TestCase): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - def test_youtube_subtitles_it(self): + def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslang'] = 'it' subtitles = self.getSubtitles() @@ -64,6 +64,12 @@ class TestYoutubeSubtitles(unittest.TestCase): self.DL.params['subtitleslang'] = 'it' subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): + self.url = 'sAjKT8FhjI8' + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + if __name__ == '__main__': unittest.main() From b3f0e5304807862ce72c136da90b860df805ee5c Mon Sep 17 00:00:00 2001 From: Jai Grimshaw Date: Sat, 31 Aug 2013 01:53:01 +1000 Subject: [PATCH 012/215] Fixed issue #1277 KeyError when no description. Allows a continue with a warning when an extractor cannot retrieve a description. 
--- youtube_dl/YoutubeDL.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b289bd9e2..afce28040 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -492,6 +492,8 @@ class YoutubeDL(object): self.report_writedescription(descfn) with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(info_dict['description']) + except (KeyError, TypeError): + self.report_warning(u'Cannot extract description.') except (OSError, IOError): self.report_error(u'Cannot write description file ' + descfn) return From bdc6b3fc64a03045b8130cdc824ee3f6c15eeff1 Mon Sep 17 00:00:00 2001 From: Allan Zhou Date: Fri, 30 Aug 2013 17:51:50 -0700 Subject: [PATCH 013/215] add support for "-f mp4" for YouTube --- youtube_dl/__init__.py | 2 +- youtube_dl/extractor/youtube.py | 37 ++++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 431460c57..b6b12683f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -192,7 +192,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', - help='video format code, specifiy the order of preference using slashes: "-f 22/17/18"') + help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". 
"-f mp4" and "-f flv" are also supported') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e486afd0..bd2b986ce 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -153,8 +153,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', + # AHLS + '96', '95', '94', '93', '92', '132', '151', # 3D '85', '84', '102', '83', '101', '82', '100', # Dash video @@ -163,8 +164,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Dash audio '141', '172', '140', '171', '139', ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', + # AHLS + '96', '95', '94', '93', '92', '132', '151', + # 3D '85', '102', '84', '101', '83', '100', '82', # Dash video '138', '248', '137', '247', '136', '246', '245', @@ -172,11 +175,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Dash audio '172', '141', '171', '140', '139', ] + _video_formats_map = { + 'flv': ['35', '34', '6', '5'], + '3gp': ['36', '17', '13'], + 'mp4': ['38', '37', '22', '18'], + 'webm': ['46', '45', '44', '43'], + } _video_extensions = { '13': '3gp', - '17': 'mp4', + '17': '3gp', '18': 'mp4', '22': 'mp4', + '36': '3gp', '37': 'mp4', '38': 'mp4', '43': 'webm', @@ -193,7 +203,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '101': 'webm', 
'102': 'webm', - # videos that use m3u8 + # AHLS '92': 'mp4', '93': 'mp4', '94': 'mp4', @@ -234,6 +244,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '22': '720x1280', '34': '360x640', '35': '480x854', + '36': '240x320', '37': '1080x1920', '38': '3072x4096', '43': '360x640', @@ -597,13 +608,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality + # available in the specified format. For example, + # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. + # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. req_formats = req_format.split('/') video_url_list = None for rf in req_formats: if rf in url_map: video_url_list = [(rf, url_map[rf])] break + if rf in self._video_formats_map: + for srf in self._video_formats_map[rf]: + if srf in url_map: + video_url_list = [(srf, url_map[srf])] + break + else: + continue + break if video_url_list is None: raise ExtractorError(u'requested format not available') return video_url_list From 691008087b902fa731a8f4f840c1821c93505840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 31 Aug 2013 15:05:59 +0200 Subject: [PATCH 014/215] Add an automatic page generator for the supported sites (related #156) They are listed in the "supportedsites.html" page. 
--- devscripts/gh-pages/update-sites.py | 33 +++++++++++++++++++++++++++++ devscripts/release.sh | 1 + 2 files changed, 34 insertions(+) create mode 100755 devscripts/gh-pages/update-sites.py diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py new file mode 100755 index 000000000..fa4bb2beb --- /dev/null +++ b/devscripts/gh-pages/update-sites.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import sys +import os +import textwrap + +# We must be able to import youtube_dl +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +import youtube_dl + +def main(): + with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf: + template = tmplf.read() + + ie_htmls = [] + for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME): + ie_html = '{}'.format(ie.IE_NAME) + try: + ie_html += ': {}'.format(ie.IE_DESC) + except AttributeError: + pass + if ie.working() == False: + ie_html += ' (Currently broken)' + ie_htmls.append('
  • {}
  • '.format(ie_html)) + + template = template.replace('@SITES@', textwrap.indent('\n'.join(ie_htmls), '\t')) + + with open('supportedsites.html', 'w', encoding='utf-8') as sitesf: + sitesf.write(template) + +if __name__ == '__main__': + main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 24c9ad8d8..62c68a6cf 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -85,6 +85,7 @@ ROOT=$(pwd) "$ROOT/devscripts/gh-pages/sign-versions.py" < "$ROOT/updates_key.pem" "$ROOT/devscripts/gh-pages/generate-download.py" "$ROOT/devscripts/gh-pages/update-copyright.py" + "$ROOT/devscripts/gh-pages/update-sites.py" git add *.html *.html.in update git commit -m "release $version" git show HEAD From 6c758d79de48956b90d9e78aec695ee0b10b00d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 31 Aug 2013 22:35:39 +0200 Subject: [PATCH 015/215] [metacafe] Add more cases for detecting the uploader detection (reported in #1343) --- youtube_dl/extractor/metacafe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e38dc98b4..e537648ff 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -122,7 +122,7 @@ class MetacafeIE(InfoExtractor): video_title = self._html_search_regex(r'(?im)(.*) - Video', webpage, u'title') description = self._og_search_description(webpage) video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);', + r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) return { From 8e4e89f1c236e1bec38c5363c1c341930056211e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 2 Sep 2013 11:54:09 +0200 Subject: [PATCH 016/215] Add an extractor for VeeHD (closes #1359) --- 
youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/veehd.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/veehd.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 90f1a4418..9f56e427c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -89,6 +89,7 @@ from .tutv import TutvIE from .unistra import UnistraIE from .ustream import UstreamIE from .vbox7 import Vbox7IE +from .veehd import VeeHDIE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py new file mode 100644 index 000000000..3a99a29c6 --- /dev/null +++ b/youtube_dl/extractor/veehd.py @@ -0,0 +1,56 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + get_element_by_id, + clean_html, +) + +class VeeHDIE(InfoExtractor): + _VALID_URL = r'https?://veehd.com/video/(?P\d+)' + + _TEST = { + u'url': u'http://veehd.com/video/4686958', + u'file': u'4686958.mp4', + u'info_dict': { + u'title': u'Time Lapse View from Space ( ISS)', + u'uploader_id': u'spotted', + u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', + webpage, u'player path') + player_url = compat_urlparse.urljoin(url, player_path) + player_page = self._download_webpage(player_url, video_id, + u'Downloading player page') + config_json = self._search_regex(r'value=\'config=({.+?})\'', + player_page, u'config json') + config = json.loads(config_json) + + video_url = compat_urlparse.unquote(config['clip']['url']) + title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) + uploader_id = 
self._html_search_regex(r'(.+?)', + webpage, u'uploader') + thumbnail = self._search_regex(r'(.*?) Date: Tue, 3 Sep 2013 01:51:17 +0200 Subject: [PATCH 017/215] Extractor for defense.gouv.fr --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/defense.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/defense.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9f56e427c..a96b62d37 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -21,6 +21,7 @@ from .dailymotion import DailymotionIE, DailymotionPlaylistIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE +from .defense import DefenseGouvFrIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py new file mode 100644 index 000000000..963fb897f --- /dev/null +++ b/youtube_dl/extractor/defense.py @@ -0,0 +1,37 @@ +# coding: utf-8 +'''Extractor for defense.gouv.fr''' +import re +import json + +from .common import InfoExtractor + + +class DefenseGouvFrIE(InfoExtractor): + '''Extractor for defense.gouv.fr''' + _IE_NAME = 'defense.gouv.fr' + _VALID_URL = (r'http://.*?\.defense\.gouv\.fr/layout/set/' + 'ligthboxvideo/base-de-medias/webtv/(.*)') + + _TEST = { + u'url': (u'http://www.defense.gouv.fr/layout/set/ligthboxvideo/', + 'base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1') + } + + def _real_extract(self, url): + title = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, title) + video_id = self._search_regex( + r"flashvars.pvg_id=\"(\d+)\";", + webpage, 'ID') + + json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/' + + video_id) + info = self._download_webpage(json_url, title, + 'Downloading JSON config') + video_url = 
json.loads(info)['renditions'][0]['url'] + + return {'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } From aa32314d09cf0ab3fad1efc2c5657e6704a7e47b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 10:48:56 +0200 Subject: [PATCH 018/215] [vimeo] add support for videos that embed the download url in the player page (fixes #1364) --- youtube_dl/extractor/vimeo.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 512e06e2a..dee4175ef 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -44,6 +44,16 @@ class VimeoIE(InfoExtractor): u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', }, }, + { + u'url': u'http://player.vimeo.com/video/54469442', + u'file': u'54469442.mp4', + u'md5': u'619b811a4417aa4abe78dc653becf511', + u'note': u'Videos that embed the url in the player page', + u'info_dict': { + u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', + u'uploader': u'The BLN & Business of Software', + }, + }, ] def _login(self): @@ -112,7 +122,8 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) config = json.loads(config) except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): @@ -132,7 +143,9 @@ class VimeoIE(InfoExtractor): video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] + video_thumbnail = config["video"].get("thumbnail") + if video_thumbnail is None: + _, video_thumbnail = sorted((int(width), 
t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description video_description = get_element_by_attribute("itemprop", "description", webpage) @@ -154,14 +167,15 @@ class VimeoIE(InfoExtractor): # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] files = { 'hd': [], 'sd': [], 'other': []} + config_files = config["video"].get("files") or config["request"].get("files") for codec_name, codec_extension in codecs: - if codec_name in config["video"]["files"]: - if 'hd' in config["video"]["files"][codec_name]: + if codec_name in config_files: + if 'hd' in config_files[codec_name]: files['hd'].append((codec_name, codec_extension, 'hd')) - elif 'sd' in config["video"]["files"][codec_name]: + elif 'sd' in config_files[codec_name]: files['sd'].append((codec_name, codec_extension, 'sd')) else: - files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + files['other'].append((codec_name, codec_extension, config_files[codec_name][0])) for quality in ('hd', 'sd', 'other'): if len(files[quality]) > 0: @@ -173,8 +187,12 @@ class VimeoIE(InfoExtractor): else: raise ExtractorError(u'No known codec found') - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) + video_url = None + if isinstance(config_files[video_codec], dict): + video_url = config_files[video_codec][video_quality].get("url") + if video_url is None: + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, From 9c2ade40de53bae865c5267642651c81d16e48a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:11:36 +0200 Subject: [PATCH 
019/215] [vimeo] Handle Assertions Error when trying to get the description In some pages the html tags are not closed, python 2.6 cannot handle it. --- youtube_dl/extractor/vimeo.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index dee4175ef..4a7d82b7a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -148,9 +148,17 @@ class VimeoIE(InfoExtractor): _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' + video_description = None + try: + video_description = get_element_by_attribute("itemprop", "description", webpage) + if video_description: video_description = clean_html(video_description) + except AssertionError as err: + # On some pages like (http://player.vimeo.com/video/54469442) the + # html tags are not closed, python 2.6 cannot handle it + if err.args[0] == 'we should not get here!': + pass + else: + raise # Extract upload date video_upload_date = None From 4ff7a0f1f6e6b1ad1743330d318dfe85806923b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:33:59 +0200 Subject: [PATCH 020/215] [dailymotion] improve the regex for extracting the video info --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1ea449ca8..439033d23 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -55,7 +55,8 @@ class DailymotionIE(InfoExtractor): embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id embed_page = self._download_webpage(embed_url, 
video_id, u'Downloading embed page') - info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info') + info = self._search_regex(r'var info = ({.*?}),$', embed_page, + 'video info', flags=re.MULTILINE) info = json.loads(info) # TODO: support choosing qualities From c8dbccde30d9ca06d4c9305329a9aacd10420276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 3 Sep 2013 11:51:01 +0200 Subject: [PATCH 021/215] [orf] Remove the test video, they seem to expire in one week --- youtube_dl/extractor/orf.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 41ef8e992..cfca2a063 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -14,19 +14,6 @@ from ..utils import ( class ORFIE(InfoExtractor): _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' - _TEST = { - u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter', - u'file': u'6566957.flv', - u'info_dict': { - u'title': u'Wetter', - u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at', - }, - u'params': { - # It uses rtmp - u'skip_download': True, - } - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') From 025171c47641a47cc2a4e4ed52c7a04b465e0e5d Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Tue, 3 Sep 2013 12:03:19 +0200 Subject: [PATCH 022/215] Suggested by @phihag --- youtube_dl/extractor/defense.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 963fb897f..424d960da 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -1,5 +1,3 @@ -# coding: utf-8 -'''Extractor for defense.gouv.fr''' import re 
import json @@ -7,14 +5,18 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): - '''Extractor for defense.gouv.fr''' _IE_NAME = 'defense.gouv.fr' _VALID_URL = (r'http://.*?\.defense\.gouv\.fr/layout/set/' - 'ligthboxvideo/base-de-medias/webtv/(.*)') + r'ligthboxvideo/base-de-medias/webtv/(.*)') _TEST = { - u'url': (u'http://www.defense.gouv.fr/layout/set/ligthboxvideo/', - 'base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1') + u'url': (u'http://www.defense.gouv.fr/layout/set/ligthboxvideo/' + u'base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1'), + u'file': u'11213.mp4', + u'md5': u'75bba6124da7e63d2d60b5244ec9430c', + "info_dict": { + "title": "attaque-chimique-syrienne-du-21-aout-2013-1" + } } def _real_extract(self, url): From 96fb5605b29c5029ab2894b5722c0937e320a3c0 Mon Sep 17 00:00:00 2001 From: Allan Zhou Date: Tue, 3 Sep 2013 18:49:35 -0700 Subject: [PATCH 023/215] AHLS -> Apple HTTP Live Streaming --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d331aa01b..01265ca28 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -154,7 +154,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', - # AHLS + # Apple HTTP Live Streaming '96', '95', '94', '93', '92', '132', '151', # 3D '85', '84', '102', '83', '101', '82', '100', @@ -165,7 +165,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141', '172', '140', '171', '139', ] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', - # AHLS + # Apple HTTP Live Streaming '96', '95', '94', '93', '92', '132', '151', # 3D '85', '102', '84', '101', '83', '100', '82', @@ -203,7 
+203,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '101': 'webm', '102': 'webm', - # AHLS + # Apple HTTP Live Streaming '92': 'mp4', '93': 'mp4', '94': 'mp4', From 08523ee20a57e7ac28d895165f3b759b311e8495 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 4 Sep 2013 14:33:32 +0200 Subject: [PATCH 024/215] release 2013.09.04 --- README.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75068fe56..2776cb3eb 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,8 @@ which means you can modify it, redistribute it or use it however you like. ## Video Format Options: -f, --format FORMAT video format code, specifiy the order of - preference using slashes: "-f 22/17/18" + preference using slashes: "-f 22/17/18". "-f mp4" + and "-f flv" are also supported --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b6284c6d6..5d7467699 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.30' +__version__ = '2013.09.04' From 150f20828be552763dddce1c45b9a4e642cff599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 4 Sep 2013 22:06:50 +0200 Subject: [PATCH 025/215] Add extractor for daum.net (closes #1330) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/daum.py | 71 ++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 youtube_dl/extractor/daum.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a96b62d37..caef53b73 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -18,6 +18,7 @@ from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE from .dailymotion import DailymotionIE, 
DailymotionPlaylistIE +from .daum import DaumIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py new file mode 100644 index 000000000..9b4566999 --- /dev/null +++ b/youtube_dl/extractor/daum.py @@ -0,0 +1,71 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + determine_ext, +) + + +class DaumIE(InfoExtractor): + _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P\d+)' + IE_NAME = u'daum.net' + + _TEST = { + u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', + u'file': u'52554690.mp4', + u'info_dict': { + u'title': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'upload_date': u'20130831', + u'duration': 3868, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + full_id = self._search_regex(r' Date: Wed, 4 Sep 2013 22:09:22 +0200 Subject: [PATCH 026/215] Credit @Huarong for tv.sohu.com --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index b6b12683f..4213ec1d5 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -28,6 +28,7 @@ __authors__ = ( 'Axel Noack', 'Albert Kim', 'Pierre Rudloff', + 'Huarong Huo', ) __license__ = 'Public Domain' From 9363169b67a7837bdd157939a896bd38b350f634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 5 Sep 2013 10:08:17 +0200 Subject: [PATCH 027/215] [daum] Get the video page from a canonical url to extract the full id (fixes #1373) and extract description. 
--- youtube_dl/extractor/daum.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 9b4566999..a804e83bd 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -18,6 +18,7 @@ class DaumIE(InfoExtractor): u'file': u'52554690.mp4', u'info_dict': { u'title': u'DOTA 2GETHER 시즌2 6회 - 2부', + u'description': u'DOTA 2GETHER 시즌2 6회 - 2부', u'upload_date': u'20130831', u'duration': 3868, }, @@ -26,7 +27,8 @@ class DaumIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) + canonical_url = 'http://tvpot.daum.net/v/%s' % video_id + webpage = self._download_webpage(canonical_url, video_id) full_id = self._search_regex(r' Date: Thu, 5 Sep 2013 10:53:40 +0200 Subject: [PATCH 028/215] Add extractor for tvcast.naver.com (closes #1331) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/naver.py | 73 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/naver.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index caef53b73..70ebd29e2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -58,6 +58,7 @@ from .mtv import MTVIE from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE +from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE from .ooyala import OoyalaIE diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py new file mode 100644 index 000000000..9df236d69 --- /dev/null +++ b/youtube_dl/extractor/naver.py @@ -0,0 +1,73 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + ExtractorError, +) + + +class NaverIE(InfoExtractor): + _VALID_URL = 
r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + + _TEST = { + u'url': u'http://tvcast.naver.com/v/81652', + u'file': u'81652.mp4', + u'info_dict': { + u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', + u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', + u'upload_date': u'20130903', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', + webpage) + if m_id is None: + raise ExtractorError(u'couldn\'t extract vid and key') + vid = m_id.group(1) + key = m_id.group(2) + query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,}) + query_urls = compat_urllib_parse.urlencode({ + 'masterVid': vid, + 'protocol': 'p2p', + 'inKey': key, + }) + info_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, + video_id, u'Downloading video info') + urls_xml = self._download_webpage( + 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' 
+ query_urls, + video_id, u'Downloading video formats info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8')) + + formats = [] + for format_el in urls.findall('EncodingOptions/EncodingOption'): + domain = format_el.find('Domain').text + if domain.startswith('rtmp'): + continue + formats.append({ + 'url': domain + format_el.find('uri').text, + 'ext': 'mp4', + 'width': int(format_el.find('width').text), + 'height': int(format_el.find('height').text), + }) + + info = { + 'id': video_id, + 'title': info.find('Subject').text, + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': info.find('WriteDate').text.replace('.', ''), + 'view_count': int(info.find('PlayCount').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info From 08e291b54d8aaa34300c02e70ff86aaa36820a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 5 Sep 2013 18:02:17 +0200 Subject: [PATCH 029/215] [generic] Recognize html5 video in the format '