From 0025da15cf310a58ee8f124e395bc1bd52fab5c8 Mon Sep 17 00:00:00 2001 From: alphapapa Date: Sat, 13 Jul 2013 16:42:16 -0500 Subject: [PATCH 01/62] Clarify that download rate is in bytes per second I found https://github.com/rg3/youtube-dl/commit/f918ec7ea29a37521d1fc22fb9f900283c5a2c49 but it is still not clear to anyone who hasn't read Issue #723 whether the limit is in bits or bytes. This is doubly confusing because 1) ISPs usually advertise speeds in bits per second, and 2) lowercase "k" and "m" are often used in correlation with bits rather than bytes. --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index db63d0adb..250cf62f8 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -208,7 +208,7 @@ def parseOpts(overrideArguments=None): help='language of the subtitles to download (optional) use IETF language tags like \'en\'') downloader.add_option('-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') + dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50k or 44.6m)') downloader.add_option('-R', '--retries', dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) downloader.add_option('--buffer-size', From d79a0e233a329e543797478a2eeb377e469c0f3f Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Tue, 17 Sep 2013 22:13:40 +0200 Subject: [PATCH 02/62] Extractor for websurg.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/websurg.py | 67 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/websurg.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 761575062..19ded18f1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -109,6 +109,7 @@ from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .wat import WatIE +from .websurg import WeBSurgIE from .weibo import WeiboIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py new file mode 100644 index 000000000..953bc9831 --- /dev/null +++ b/youtube_dl/extractor/websurg.py @@ -0,0 +1,67 @@ +# coding: utf-8 + +import re + +from ..utils import ( + compat_urllib_request, + compat_urllib_parse +) + +from .common import InfoExtractor + +class WeBSurgIE(InfoExtractor): + IE_NAME = u'websurg.com' + _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)' + + _TEST = { + u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012', + u'file': u'vd01en4012.mp4', + u'params': { + u'skip_download': True, + } + } + + _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1' + + def _real_extract(self, url): + + login_form = { + 'username': self._downloader.params['username'], + 'password': self._downloader.params['password'], + 'Submit': 1 + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request.add_header( + 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') + login_results = compat_urllib_request.urlopen(request).info() + + sessid = re.match(r'PHPSESSID=(.*);', + login_results['Set-Cookie']).group(1) + request = compat_urllib_request.Request( + url, compat_urllib_parse.urlencode(login_form), + {'Cookie': 
'PHPSESSID=' + sessid + ';'}) + webpage = compat_urllib_request.urlopen(request).read() + + video_id = re.match(self._VALID_URL, url).group(1) + + url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage) + + if url_info is None: + self._downloader.report_warning( + u'Unable to log in: bad username/password') + return + + return {'id': video_id, + 'title' : re.search( + r'property="og:title" content="(.*?)" />' + , webpage).group(1), + 'description': re.search( + r'name="description" content="(.*?)" />', webpage).group(1), + 'ext' : 'mp4', + 'url' : url_info.group(1) + '/' + url_info.group(2), + 'thumbnail': re.search( + r'property="og:image" content="(.*?)" />', webpage + ).group(1) + } From cc6943e86aef74bef767be7f4027ab6122c95d55 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Wed, 18 Sep 2013 00:07:04 +0200 Subject: [PATCH 03/62] Improvements --- youtube_dl/extractor/websurg.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py index 953bc9831..efc8029af 100644 --- a/youtube_dl/extractor/websurg.py +++ b/youtube_dl/extractor/websurg.py @@ -23,7 +23,7 @@ class WeBSurgIE(InfoExtractor): _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1' - def _real_extract(self, url): + def _real_initialize(self): login_form = { 'username': self._downloader.params['username'], @@ -35,14 +35,13 @@ class WeBSurgIE(InfoExtractor): self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) request.add_header( 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') - login_results = compat_urllib_request.urlopen(request).info() + compat_urllib_request.urlopen(request).info() - sessid = re.match(r'PHPSESSID=(.*);', - login_results['Set-Cookie']).group(1) - request = compat_urllib_request.Request( - url, compat_urllib_parse.urlencode(login_form), - {'Cookie': 'PHPSESSID=' + sessid + ';'}) - webpage = compat_urllib_request.urlopen(request).read() + def _real_extract(self, url): + + request = compat_urllib_request.Request(url) + webpage = unicode( + compat_urllib_request.urlopen(request).read(), 'utf-8') video_id = re.match(self._VALID_URL, url).group(1) @@ -52,16 +51,10 @@ class WeBSurgIE(InfoExtractor): self._downloader.report_warning( u'Unable to log in: bad username/password') return - return {'id': video_id, - 'title' : re.search( - r'property="og:title" content="(.*?)" />' - , webpage).group(1), - 'description': re.search( - r'name="description" content="(.*?)" />', webpage).group(1), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), 'ext' : 'mp4', 'url' : url_info.group(1) + '/' + url_info.group(2), - 'thumbnail': re.search( - r'property="og:image" content="(.*?)" />', webpage - ).group(1) + 'thumbnail': self._og_search_thumbnail(webpage) } From 5c1d63b73737bb23885ae6079e2004b5f084eb9c Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Fri, 4 Oct 2013 01:04:38 +0200 Subject: [PATCH 04/62] Changes suggested by @phihag --- youtube_dl/extractor/websurg.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py index efc8029af..849334aa0 100644 --- a/youtube_dl/extractor/websurg.py +++ b/youtube_dl/extractor/websurg.py @@ -36,21 +36,21 @@ class WeBSurgIE(InfoExtractor): request.add_header( 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') compat_urllib_request.urlopen(request).info() + request = 
compat_urllib_request.Request(self._LOGIN_URL) + webpage = compat_urllib_request.urlopen(request).read() + + if webpage != 'OK': + self._downloader.report_error( + u'Unable to log in: bad username/password') def _real_extract(self, url): - - request = compat_urllib_request.Request(url) - webpage = unicode( - compat_urllib_request.urlopen(request).read(), 'utf-8') - video_id = re.match(self._VALID_URL, url).group(1) + request = compat_urllib_request.Request(url) + webpage = self._download_webpage(url, video_id) + url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage) - if url_info is None: - self._downloader.report_warning( - u'Unable to log in: bad username/password') - return return {'id': video_id, 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), From b039775057abf6005ceef2819a746c9f3b671cd3 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Fri, 4 Oct 2013 01:07:24 +0200 Subject: [PATCH 05/62] Unused variable --- youtube_dl/extractor/websurg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py index 849334aa0..96a1bb852 100644 --- a/youtube_dl/extractor/websurg.py +++ b/youtube_dl/extractor/websurg.py @@ -46,7 +46,6 @@ class WeBSurgIE(InfoExtractor): def _real_extract(self, url): video_id = re.match(self._VALID_URL, url).group(1) - request = compat_urllib_request.Request(url) webpage = self._download_webpage(url, video_id) url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage) From 73b4fafd82256c66198b1670d1a6dccfaf5f782c Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Fri, 4 Oct 2013 01:12:42 +0200 Subject: [PATCH 06/62] Use self._download_webpage everywhere --- youtube_dl/extractor/websurg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py index 96a1bb852..7d335d444 100644 --- a/youtube_dl/extractor/websurg.py +++ b/youtube_dl/extractor/websurg.py @@ -36,8 +36,7 @@ class WeBSurgIE(InfoExtractor): request.add_header( 'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8') compat_urllib_request.urlopen(request).info() - request = compat_urllib_request.Request(self._LOGIN_URL) - webpage = compat_urllib_request.urlopen(request).read() + webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in') if webpage != 'OK': self._downloader.report_error( From 400afddaf49353c9b4c31d17d5efe4045e500fec Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 5 Oct 2013 09:37:11 +0200 Subject: [PATCH 07/62] Add CinemassacreIE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cinemassacre.py | 100 +++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/cinemassacre.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d1b7e5f99..db30edc27 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -12,6 +12,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cinemassacre import CinemassacreIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..16eaff3a1 --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,100 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor 
+from ..utils import ( + ExtractorError, +) + +class CinemassacreIE(InfoExtractor): + """Information Extractor for Cinemassacre""" + _VALID_URL = r'(?:http://)?(?:www\.)?(?Pcinemassacre\.com/(?P[0-9]{4})/(?P[0-9]{2})/(?P[0-9]{2})/.+?)(?:[/?].*)?' + _TESTS = [{ + u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + u'file': u'19911.mp4', + u'info_dict': { + u'upload_date': u'20121110', + u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', + #u'description': u'“Angry Video Game Nerd: The Movie” is...', # Description is too long + }, + u'params': { + u'skip_download': True, + }, + }] + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + webpage_url = u'http://' + mobj.group('url') + webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') + video_id = self._html_search_regex(r'src="http://player\.screenwavemedia\.com/play/embed\.php\?id=(?P.+?)"', + webpage, u'video_id') + video_title = self._html_search_regex(r'
<h1 class="entry-title">
(?P.+?)</h1>[^<]*</div>', + webpage, u'title') + video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, u'description', flags=re.DOTALL, fatal=False) + + playerdata_url = u'http://player.screenwavemedia.com/play/player.php?id=' + video_id + playerdata = self._download_webpage(playerdata_url, video_id) + base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})/vod\'', + playerdata, u'base_url') + base_url += '/Cinemassacre/' + # The file names in playerdata are wrong for some videos??? + sd_file = 'Cinemassacre-%s_high.mp4' % video_id + hd_file = 'Cinemassacre-%s.mp4' % video_id + video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + + formats = [{ + 'id': video_id, + 'url': base_url + hd_file, + 'format': 'hd', + 'ext': 'mp4', + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + }, + { + 'id': video_id, + 'url': base_url + sd_file, + 'ext': 'mp4', + 'format': 'sd', + 'title': video_title, + 'description': video_description, + 'upload_date': video_date, + 'thumbnail': video_thumbnail, + }] + + if self._downloader.params.get('listformats', None): + self._print_formats(formats) + return + + req_format = self._downloader.params.get('format', 'best') + self.to_screen(u'Format: %s' % req_format) + + if req_format is None or req_format == 'best': + return [formats[0]] + elif req_format == 'worst': + return [formats[-1]] + elif req_format in ('-1', 'all'): + return formats + else: + format = self._specific( req_format, formats ) + if format is None: + raise ExtractorError(u'Requested format not available') + return [format] + + def _print_formats(self, formats): + """Print all available formats""" + print(u'Available formats:') + print(u'ext\t\tformat') + print(u'---------------------------------') + for format in formats: + print(u'%s\t\t%s' % (format['ext'], format['format'])) + + def _specific(self, req_format, formats): + for x in formats: + if x["format"] == req_format: + return x + return None From 1ece880d7c94d9b966f52855949aae6c0f37a140 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sat, 5 Oct 2013 13:36:13 +0200 Subject: [PATCH 08/62] [CinemassacreIE] Add support for other embed methods --- youtube_dl/extractor/cinemassacre.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 16eaff3a1..f0629ee93 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -20,6 +20,17 @@ class CinemassacreIE(InfoExtractor): u'params': { u'skip_download': True, }, + }, + { + u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + u'file': u'521be8ef82b16.mp4', + u'info_dict': { + u'upload_date': u'20131002', + u'title': u'The Mummy’s Hand (1940)', + }, + u'params': { + u'skip_download': True, + }, }] def _real_extract(self,url): @@ -28,19 +39,24 @@ class CinemassacreIE(InfoExtractor): webpage_url = u'http://' + mobj.group('url') webpage = self._download_webpage(webpage_url, None) # Don't know video id yet video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - video_id = self._html_search_regex(r'src="http://player\.screenwavemedia\.com/play/embed\.php\?id=(?P<video_id>.+?)"', - webpage, u'video_id') - video_title = 
self._html_search_regex(r'<h1 class="entry-title">(?P<title>.+?)</h1>[^<]*</div>', + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + if not mobj: + raise ExtractorError(u'Can\'t extract embed url and video id') + playerdata_url = mobj.group(u'embed_url') + video_id = mobj.group(u'video_id') + + video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|', webpage, u'title') video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', webpage, u'description', flags=re.DOTALL, fatal=False) + if len(video_description) == 0: + video_description = None - playerdata_url = u'http://player.screenwavemedia.com/play/player.php?id=' + video_id playerdata = self._download_webpage(playerdata_url, video_id) - base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})/vod\'', + base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})/(?:vod|Cinemassacre)\'', playerdata, u'base_url') base_url += '/Cinemassacre/' - # The file names in playerdata are wrong for some videos??? + # Important: The file names in playerdata are not used by the player and even wrong for some videos sd_file = 'Cinemassacre-%s_high.mp4' % video_id hd_file = 'Cinemassacre-%s.mp4' % video_id video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id From 91a26ca559d225307d1bcaac74a5ca499748adc5 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sat, 5 Oct 2013 13:40:05 +0200 Subject: [PATCH 09/62] [CinemassacreIE] Remove docstring from class --- youtube_dl/extractor/cinemassacre.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index f0629ee93..181f57e76 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -7,7 +7,6 @@ from ..utils import ( ) class CinemassacreIE(InfoExtractor): - """Information Extractor for Cinemassacre""" _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' _TESTS = [{ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', From ca215e0a4fdf42cba913f5b21d0e9e0e46814102 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sat, 5 Oct 2013 13:42:17 +0200 Subject: [PATCH 10/62] [CinemassacreIE] Use MD5 to check in TEST description --- youtube_dl/extractor/cinemassacre.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 181f57e76..17a7916cb 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -14,7 +14,7 @@ class CinemassacreIE(InfoExtractor): u'info_dict': { u'upload_date': u'20121110', u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', - #u'description': u'“Angry Video Game Nerd: The Movie” is...', # Description is too long + u'description': u'md5:fb87405fcb42a331742a0dce2708560b', }, u'params': { u'skip_download': True, From ad7a071ab678d8ec5a2cee21efbf1a88a8ff8544 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 6 Oct 2013 20:55:24 +0200 Subject: [PATCH 11/62] Only download 1 sec. 
with rtmpdump in test mode --- youtube_dl/FileDownloader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index d6673fd3a..f1ff0b520 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -267,7 +267,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, test): self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -291,6 +291,8 @@ class FileDownloader(object): basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] + if test: + basic_args += ['--stop', '1'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: @@ -408,7 +410,8 @@ class FileDownloader(object): info_dict.get('player_url', None), info_dict.get('page_url', None), info_dict.get('play_path', None), - info_dict.get('tc_url', None)) + info_dict.get('tc_url', None), + self.params.get('test', False)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): From 8e4f824365543e394286742fcdb4c0a548becc8e Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 6 Oct 2013 22:04:32 +0200 Subject: [PATCH 12/62] Remove test parameter from _download_with_rtmpdump --- youtube_dl/FileDownloader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index f1ff0b520..2cda5d52a 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -267,7 +267,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, test): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -291,7 +291,7 @@ class FileDownloader(object): basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] - if test: + if self.params.get('test', False): basic_args += ['--stop', '1'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): @@ -410,8 +410,7 @@ class FileDownloader(object): info_dict.get('player_url', None), info_dict.get('page_url', None), info_dict.get('play_path', None), - info_dict.get('tc_url', None), - self.params.get('test', False)) + info_dict.get('tc_url', None)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): From 88bd97e34c91a86dfe7dd01a9677b76ef43e1b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 8 Oct 2013 21:23:55 +0200 Subject: [PATCH 13/62] [vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. 
Extract all the available formats and set the 'formats' field of the result --- youtube_dl/extractor/vevo.py | 68 ++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 70408c4f0..1c1cc418d 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,11 +1,15 @@ import re import json +import xml.etree.ElementTree +import datetime from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, ) + class VevoIE(InfoExtractor): """ Accepts urls from vevo.com or in the format 'vevo:{id}' @@ -15,11 +19,11 @@ class VevoIE(InfoExtractor): _TEST = { u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', - u'md5': u'06bea460acb744eab74a9d7dcb4bfd61', u'info_dict': { u"upload_date": u"20130624", u"uploader": u"Hurts", - u"title": u"Somebody to Die For" + u"title": u"Somebody to Die For", + u'duration': 230, } } @@ -27,27 +31,47 @@ class VevoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - json_url = 'http://www.vevo.com/data/video/%s' % video_id - base_url = 'http://smil.lvl3.vevo.com' - videos_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (base_url, video_id, video_id.lower()) + json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - links_webpage = self._download_webpage(videos_url, video_id, u'Downloading videos urls') self.report_extraction(video_id) - video_info = json.loads(info_json) - m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage)) - if m_urls is None or len(m_urls) == 0: - raise ExtractorError(u'Unable to extract video url') - # They are sorted from worst to best quality - m_url = m_urls[-1] - video_url = base_url + '/' + m_url.group('url') - ext = m_url.group('ext') + video_info = json.loads(info_json)['video'] + last_version = {'version': -1} + for version in video_info['videoVersions']: + # These are the HTTP downloads, other types are for different manifests + if version['sourceType'] == 2: + if version['version'] > last_version['version']: + last_version = version + if last_version['version'] == -1: + raise ExtractorError(u'Unable to extract last version of the video') - return {'url': video_url, - 'ext': ext, - 'id': video_id, - 'title': video_info['title'], - 'thumbnail': video_info['img'], - 'upload_date': video_info['launchDate'].replace('/',''), - 'uploader': video_info['Artists'][0]['title'], - } + renditions = xml.etree.ElementTree.fromstring(last_version['data']) + formats = [] + # Already sorted from worst to best quality + for rend in renditions.findall('rendition'): + attr = rend.attrib + f_url = attr['url'] + formats.append({ + 'url': f_url, + 'ext': determine_ext(f_url), + 'height': int(attr['frameheight']), + 'width': int(attr['frameWidth']), + }) + + date_epoch = int(self._search_regex( + r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000 + upload_date = datetime.datetime.fromtimestamp(date_epoch) + info = { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'thumbnail': video_info['imageUrl'], + 'upload_date': upload_date.strftime('%Y%m%d'), + 'uploader': video_info['mainArtists'][0]['artistName'], + 'duration': video_info['duration'], + } + + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + + return info From 
1d368c7589908d9e810732f3c8aeecd24f3cce04 Mon Sep 17 00:00:00 2001 From: Tom <eales@live.com> Date: Wed, 9 Oct 2013 21:56:09 +0800 Subject: [PATCH 14/62] Tiny tpo --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 073a3837c..e85e03fa4 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -119,7 +119,7 @@ class YoutubeDL(object): and not params['restrictfilenames']): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( - u'Assuming --restrict-filenames isnce file system encoding ' + u'Assuming --restrict-filenames since file system encoding ' u'cannot encode all charactes. ' u'Set the LC_ALL environment variable to fix this.') params['restrictfilenames'] = True From a34c2faae4315f8c5ea6ef8ea2cc6dc063cb0149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 9 Oct 2013 16:41:36 +0200 Subject: [PATCH 15/62] [youtube] set the 'name' parameter in the subtitles url (fixes #1577) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 35310b39f..a7c514513 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1116,6 +1116,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat'), + 'name': l[0], }) url = u'http://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url From 57da92b7df21137fc7c02d467365ae2189e0baed Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 9 Oct 2013 23:50:38 +0200 Subject: [PATCH 16/62] [youtube] Do not recognize attribution link as user (Fixes #1573) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a7c514513..8222a880f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1635,7 +1635,7 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!watch(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' From 8f1ae18a181eb74f6e592a99774624b96a1c62d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 9 Oct 2013 23:50:47 +0200 Subject: [PATCH 17/62] release 2013.10.09 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8b4f03308..1004af116 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.07' +__version__ = '2013.10.09' From 2e1fa03bf5b165e930dd68278360b53036326cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 15:25:11 +0200 Subject: [PATCH 18/62] Add an 
extractor for video.nhl.com (closes #1586) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nhl.py | 59 ++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 1 + 3 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/nhl.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c01de6b5e..f44468d35 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -80,6 +80,7 @@ from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE +from .nhl import NHLIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py new file mode 100644 index 000000000..f86d9de7e --- /dev/null +++ b/youtube_dl/extractor/nhl.py @@ -0,0 +1,59 @@ +import re +import json +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_urllib_parse, + determine_ext, + unified_strdate, +) + + +class NHLIE(InfoExtractor): + IE_NAME = u'nhl.com' + _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)' + + _TEST = { + u'url': u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', + u'file': u'453614.mp4', + u'info_dict': { + u'title': u'Quick clip: Weise 4-3 goal vs Flames', + u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.', + u'duration': 18, + u'upload_date': u'20131006', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id + info_json = self._download_webpage(json_url, video_id, + u'Downloading info json') + info_json = info_json.replace('\\\'', '\'') + info = json.loads(info_json)[0] + + initial_video_url = info['publishPoint'] + data = compat_urllib_parse.urlencode({ + 'type': 'fvod', + 'path': initial_video_url.replace('.mp4', '_sd.mp4'), + }) + path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data + path_response = self._download_webpage(path_url, video_id, + u'Downloading final video url') + path_doc = xml.etree.ElementTree.fromstring(path_response) + video_url = path_doc.find('path').text + + join = compat_urlparse.urljoin + return { + 'id': video_id, + 'title': info['name'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'description': info['description'], + 'duration': int(info['duration']), + 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), + 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index de2654762..82a1daeb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -715,6 +715,7 @@ def unified_strdate(date_str): '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M', '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S', ] for expression in format_expressions: try: From 4193a453c22bea044a0bfb204dfbc1374304a1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 16:18:02 +0200 Subject: [PATCH 19/62] Don't add extractors with IE_DESC set to False to the page of supported sites. 
--- devscripts/gh-pages/update-sites.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index 33f242480..153e15c8a 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -16,10 +16,11 @@ def main(): ie_htmls = [] for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): ie_html = '<b>{}</b>'.format(ie.IE_NAME) - try: + ie_desc = getattr(ie, 'IE_DESC', None) + if ie_desc is False: + continue + elif ie_desc is not None: ie_html += ': {}'.format(ie.IE_DESC) - except AttributeError: - pass if ie.working() == False: ie_html += ' (Currently broken)' ie_htmls.append('<li>{}</li>'.format(ie_html)) From 63da13e8291e2debce073aea63bcfb710c0f5f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 19:37:17 +0200 Subject: [PATCH 20/62] Add an extractor for faz.net (closes #1582) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/faz.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/faz.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f44468d35..a4d0c71ec 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -34,6 +34,7 @@ from .eighttracks import EightTracksIE from .escapist import EscapistIE from .exfm import ExfmIE from .facebook import FacebookIE +from .faz import FazIE from .fktv import ( FKTVIE, FKTVPosteckeIE, diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py new file mode 100644 index 000000000..deaa4ed2d --- /dev/null +++ b/youtube_dl/extractor/faz.py @@ -0,0 +1,60 @@ +# encoding: utf-8 +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + clean_html, + get_element_by_attribute, +) + + +class FazIE(InfoExtractor): + IE_NAME = u'faz.net' + _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+).html' + + _TEST = { + u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', + u'file': u'12610585.mp4', + u'info_dict': { + u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', + u'description': u'md5:1453fbf9a0d041d985a47306192ea253', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + self.to_screen(video_id) + webpage = self._download_webpage(url, video_id) + config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage, + u'config xml url') + config_xml = self._download_webpage(config_xml_url, video_id, + u'Downloading config xml') + config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + + encodings = config.find('ENCODINGS') + formats = [] + for code in ['LOW', 'HIGH', 'HQ']: + encoding = encodings.find(code) + if encoding is None: + continue + encoding_url = encoding.find('FILENAME').text + formats.append({ + 'url': encoding_url, + 'ext': determine_ext(encoding_url), + 'format_id': code.lower(), + }) + + descr_html = get_element_by_attribute('class', 'Content Copy', webpage) + info = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': clean_html(descr_html), + 'thumbnail': config.find('STILL/STILL_BIG').text, + } + # TODO: Remove when #980 has been merged + 
info.update(formats[-1]) + return info From 0ab4ff6378b40d35a0bd0e63c3bd9b837c4e6b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 19:53:44 +0200 Subject: [PATCH 21/62] [mtv] Strip the description There were some tabs and newlines added around the string. --- youtube_dl/extractor/mtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 001a576a8..e520e2bb4 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -87,7 +87,7 @@ class MTVIE(InfoExtractor): description_node = itemdoc.find('description') if description_node is not None: - description = description_node.text + description = description_node.text.strip() else: description = None From 1cbb27b151cb3c2195a551726c05a8f156c5b8b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 19:55:09 +0200 Subject: [PATCH 22/62] [gamespot] Mark as broken (#1587) --- youtube_dl/extractor/gamespot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index cd3bbe65f..5edbf678a 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -8,6 +8,7 @@ from ..utils import ( ) class GameSpotIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' _TEST = { u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", From cb354c8f6218ecd722b218a08935f7bd7eecabd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 10 Oct 2013 21:01:45 +0200 Subject: [PATCH 23/62] [yahoo] Download the info from another page The 'meta' field is not always in the video webpage --- youtube_dl/extractor/yahoo.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 5bdd5d591..464b498f5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -50,6 +50,21 @@ class YahooIE(InfoExtractor): webpage, u'items', flags=re.MULTILINE) items = json.loads(items_json) info = items['mediaItems']['query']['results']['mediaObj'][0] + # The 'meta' field is not always in the video webpage, we request it + # from another page + long_id = info['id'] + query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' + ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id) + data = compat_urllib_parse.urlencode({ + 'q': query, + 'env': 'prod', + 'format': 'json', + }) + query_result_json = self._download_webpage( + 'http://video.query.yahoo.com/v1/public/yql?' + data, + video_id, u'Downloading video info') + query_result = json.loads(query_result_json) + info = query_result['query']['results']['mediaObj'][0] meta = info['meta'] formats = [] From bc4f29170f7fe1088f63fdc42f225656d3680c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 11 Oct 2013 11:19:09 +0200 Subject: [PATCH 24/62] Add a PostProcessor for adding metadata to the file (closes #1570) It currently sets the title, the date and the author values. 
--- youtube_dl/PostProcessor.py | 33 +++++++++++++++++++++++++++++++++ youtube_dl/__init__.py | 5 +++++ 2 files changed, 38 insertions(+) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 3ee1d3c58..fbf8a7f98 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -2,6 +2,7 @@ import os import subprocess import sys import time +import datetime from .utils import * @@ -467,3 +468,35 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return True, information + + +class FFmpegMetadataPP(FFmpegPostProcessor): + def run(self, info): + metadata = {} + if info.get('title') is not None: + metadata['title'] = info['title'] + if info.get('upload_date') is not None: + metadata['date'] = info['upload_date'] + if info.get('uploader') is not None: + metadata['artist'] = info['uploader'] + elif info.get('uploader_id') is not None: + metadata['artist'] = info['uploader_id'] + + if not metadata: + self._downloader.to_screen(u'[ffmpeg] There isn\'t any metadata to add') + return True, info + + filename = info['filepath'] + ext = os.path.splitext(filename)[1][1:] + temp_filename = filename + u'.temp' + + options = ['-c', 'copy'] + for (name, value) in metadata.items(): + options.extend(['-metadata', '%s="%s"' % (name, value)]) + options.extend(['-f', ext]) + + self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) + self.run_ffmpeg(filename, temp_filename, options) + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + return True, info diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3ff78daac..3513d719f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -358,6 +358,8 @@ def parseOpts(overrideArguments=None): help='do not overwrite post-processed files; the post-processed files are overwritten by default') postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, help='embed subtitles in the video (only for mp4 videos)') + postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False, + help='add metadata to the files') parser.add_option_group(general) @@ -651,6 +653,9 @@ def _real_main(argv=None): ydl.add_default_info_extractors() # PostProcessors + # Add the metadata pp first, the other pps will copy it + if opts.addmetadata: + ydl.add_post_processor(FFmpegMetadataPP()) if opts.extractaudio: ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) if opts.recodevideo: From 9026dd3858050db071b15db90cd953f7ab3de6c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 11 Oct 2013 12:42:15 +0200 Subject: [PATCH 25/62] Make sure it only runs rtmpdump one time in test mode and return True if the download can be resumed --- youtube_dl/FileDownloader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 2cda5d52a..8ecabab1a 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -270,6 +270,7 @@ class FileDownloader(object): def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): self.report_destination(filename) tmpfilename = self.temp_name(filename) + test = self.params.get('test', False) # Check for 
rtmpdump first try: @@ -291,7 +292,7 @@ class FileDownloader(object): basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] - if self.params.get('test', False): + if test: basic_args += ['--stop', '1'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): @@ -302,7 +303,7 @@ class FileDownloader(object): shell_quote = repr self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) retval = subprocess.call(args) - while retval == 2 or retval == 1: + while (retval == 2 or retval == 1) and not test: prevsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) time.sleep(5.0) # This seems to be needed @@ -315,7 +316,7 @@ class FileDownloader(object): self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') retval = 0 break - if retval == 0: + if retval == 0 or (test and retval == 2): fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) From 91dbaef40692a68a53aa74858f538a5699bae9ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 11 Oct 2013 14:33:26 +0200 Subject: [PATCH 26/62] [nhl] Add an extractor for videocenter's categories (#1586) It downloads the last 12 videos. --- test/test_playlists.py | 10 +++ youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/nhl.py | 105 ++++++++++++++++++++++++------- 3 files changed, 94 insertions(+), 23 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index c33511333..de8bd298a 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -16,6 +16,7 @@ from youtube_dl.extractor import ( UstreamChannelIE, SoundcloudUserIE, LivestreamIE, + NHLVideocenterIE, ) from youtube_dl.utils import * @@ -74,5 +75,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'TEDCity2.0 (English)') self.assertTrue(len(result['entries']) >= 4) + def test_nhl_videocenter(self): + dl = FakeYDL() + ie = NHLVideocenterIE(dl) + result = ie.extract('http://video.canucks.nhl.com/videocenter/console?catid=999') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], u'999') + self.assertEqual(result['title'], u'Highlights') + self.assertEqual(len(result['entries']), 12) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a4d0c71ec..688196869 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -81,7 +81,7 @@ from .naver import NaverIE from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE -from .nhl import NHLIE +from .nhl import NHLIE, NHLVideocenterIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index f86d9de7e..e8d43dd13 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -11,29 +11,14 @@ from ..utils import ( ) -class NHLIE(InfoExtractor): - IE_NAME = u'nhl.com' - _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)' +class NHLBaseInfoExtractor(InfoExtractor): + @staticmethod + def _fix_json(json_string): + return json_string.replace('\\\'', '\'') - _TEST = { - u'url': 
u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - u'file': u'453614.mp4', - u'info_dict': { - u'title': u'Quick clip: Weise 4-3 goal vs Flames', - u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.', - u'duration': 18, - u'upload_date': u'20131006', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - info_json = self._download_webpage(json_url, video_id, - u'Downloading info json') - info_json = info_json.replace('\\\'', '\'') - info = json.loads(info_json)[0] + def _extract_video(self, info): + video_id = info['id'] + self.report_extraction(video_id) initial_video_url = info['publishPoint'] data = compat_urllib_parse.urlencode({ @@ -57,3 +42,79 @@ class NHLIE(InfoExtractor): 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), } + + +class NHLIE(NHLBaseInfoExtractor): + IE_NAME = u'nhl.com' + _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)' + + _TEST = { + u'url': u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', + u'file': u'453614.mp4', + u'info_dict': { + u'title': u'Quick clip: Weise 4-3 goal vs Flames', + u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.', + u'duration': 18, + u'upload_date': u'20131006', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id + info_json = self._download_webpage(json_url, video_id, + u'Downloading info json') + info_json = self._fix_json(info_json) + info = json.loads(info_json)[0] + return self._extract_video(info) + + +class NHLVideocenterIE(NHLBaseInfoExtractor): + IE_NAME = u'nhl.com:videocenter' + IE_DESC = u'Download the first 12 videos from a videocenter category' + _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?' + + @classmethod + def suitable(cls, url): + if NHLIE.suitable(url): + return False + return super(NHLVideocenterIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + team = mobj.group('team') + webpage = self._download_webpage(url, team) + cat_id = self._search_regex( + [r'var defaultCatId = "(.+?)";', + r'{statusIndex:0,index:0,.*?id:(.*?),'], + webpage, u'category id') + playlist_title = self._html_search_regex( + r'\?catid=%s">(.*?)</a>' % cat_id, + webpage, u'playlist title', flags=re.DOTALL) + + data = compat_urllib_parse.urlencode({ + 'cid': cat_id, + # This is the default value + 'count': 12, + 'ptrs': 3, + 'format': 'json', + }) + path = '/videocenter/servlets/browse?' 
+ data + request_url = compat_urlparse.urljoin(url, path) + response = self._download_webpage(request_url, playlist_title) + response = self._fix_json(response) + if not response.strip(): + self._downloader.report_warning(u'Got an empty reponse, trying ' + u'adding the "newvideos" parameter') + response = self._download_webpage(request_url + '&newvideos=true', + playlist_title) + response = self._fix_json(response) + videos = json.loads(response) + + return { + '_type': 'playlist', + 'title': playlist_title, + 'id': cat_id, + 'entries': [self._extract_video(i) for i in videos], + } From 3823342d9d0a2c50327aa3e1f85a7e8e1221b0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 11 Oct 2013 16:33:31 +0200 Subject: [PATCH 27/62] [arte] Prepare for generic format support (#980) --- youtube_dl/extractor/arte.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 69b3b0ad7..4707d7cca 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -109,17 +109,27 @@ class ArteTvIE(InfoExtractor): return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url formats = filter(_match_lang, formats) + # Some formats use the m3u8 protocol + formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) # Prefer videos without subtitles in the same language formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) # Pick the best quality - format_info = formats[-1] - if format_info['mediaType'] == u'rtmp': - info_dict['url'] = format_info['streamer'] - info_dict['play_path'] = 'mp4:' + format_info['url'] - else: - info_dict['url'] = format_info['url'] + def _format(format_info): + info = {'ext': 'flv', + 'width': format_info.get('width'), + 'height': format_info.get('height'), + } + if format_info['mediaType'] == u'rtmp': + info['url'] = format_info['streamer'] + info['play_path'] = 'mp4:' + format_info['url'] + else: + info_dict['url'] = format_info['url'] + return info + info_dict['formats'] = [_format(f) for f in formats] + # TODO: Remove when #980 has been merged + info_dict.update(info_dict['formats'][-1]) return info_dict From 8032e31f2dfcccd2a20bc028a6534ac9f89ee10a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 11 Oct 2013 20:36:50 +0200 Subject: [PATCH 28/62] Merge pull request #1558 from rzhxeo/cinemassacre Add support for http://cinemassacre.com --- youtube_dl/extractor/cinemassacre.py | 96 +++++++++++----------------- 1 file changed, 36 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 17a7916cb..6925b96c2 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -6,33 +6,36 @@ from ..utils import ( ExtractorError, ) + class CinemassacreIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' 
_TESTS = [{ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - u'file': u'19911.mp4', + u'file': u'19911.flv', u'info_dict': { - u'upload_date': u'20121110', + u'upload_date': u'20121110', u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', u'description': u'md5:fb87405fcb42a331742a0dce2708560b', }, u'params': { + # rtmp download u'skip_download': True, }, }, { u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - u'file': u'521be8ef82b16.mp4', + u'file': u'521be8ef82b16.flv', u'info_dict': { - u'upload_date': u'20131002', + u'upload_date': u'20131002', u'title': u'The Mummy’s Hand (1940)', }, u'params': { + # rtmp download u'skip_download': True, }, }] - def _real_extract(self,url): + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) webpage_url = u'http://' + mobj.group('url') @@ -50,66 +53,39 @@ class CinemassacreIE(InfoExtractor): webpage, u'description', flags=re.DOTALL, fatal=False) if len(video_description) == 0: video_description = None - + playerdata = self._download_webpage(playerdata_url, video_id) - base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})/(?:vod|Cinemassacre)\'', + base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'', playerdata, u'base_url') base_url += '/Cinemassacre/' - # Important: The file names in playerdata are not used by the player and even wrong for some videos + # Important: The file names in playerdata are not used by the player and even wrong for some videos sd_file = 'Cinemassacre-%s_high.mp4' % video_id hd_file = 'Cinemassacre-%s.mp4' % video_id video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id - - formats = [{ - 'id': video_id, - 'url': base_url + hd_file, - 'format': 'hd', - 'ext': 'mp4', - 'title': video_title, + + formats = [ + { + 'url': base_url + sd_file, + 'ext': 'flv', + 'format': 'sd', + 'format_id': 'sd', + }, + { + 'url': base_url + hd_file, + 'ext': 'flv', + 'format': 'hd', + 'format_id': 'hd', + }, + ] + + info = { + 'id': video_id, + 'title': video_title, + 'formats': formats, 'description': video_description, 'upload_date': video_date, - 'thumbnail': video_thumbnail, - }, - { - 'id': video_id, - 'url': base_url + sd_file, - 'ext': 'mp4', - 'format': 'sd', - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - }] - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - req_format = self._downloader.params.get('format', 'best') - self.to_screen(u'Format: %s' % req_format) - - if req_format is None or req_format == 'best': - return [formats[0]] - elif req_format == 'worst': - return [formats[-1]] - elif req_format in ('-1', 'all'): - return formats - else: - format = self._specific( req_format, formats ) - if format is None: - raise ExtractorError(u'Requested format not available') - return [format] - - def _print_formats(self, formats): - """Print all available formats""" - print(u'Available formats:') - print(u'ext\t\tformat') - print(u'---------------------------------') - for format in formats: - print(u'%s\t\t%s' % (format['ext'], format['format'])) - - def _specific(self, req_format, formats): - for x in formats: - if x["format"] == req_format: - return x - return None + 'thumbnail': video_thumbnail, + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return 
info From 32835331496e0a77cf7b21f34b80b2ae6e9142a5 Mon Sep 17 00:00:00 2001 From: Joey Adams <joeyadams3.14159@gmail.com> Date: Fri, 11 Oct 2013 21:52:30 -0400 Subject: [PATCH 29/62] Fix Brightcove detection when another Flash object is on the page The regex used non-greedy match, but alas it failed on input like this: <object class="...> ... class="BrightcoveExperience" It captured two objects and the intervening HTML. This commit fixes this by not allowing a ">" to appear before BrightcoveExperience. Video in question: http://www.harpercollinschildrens.com/feature/petethecat/ --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7060c6f92..d48c84f8d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -121,7 +121,7 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) # Look for BrightCove: - m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) + m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) From 0f6d12e43c0adbd362765aa6b6f54c67e034a247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 12 Oct 2013 13:29:02 +0200 Subject: [PATCH 30/62] Don't set the '-aq' option with the opus format (fixes #1263) --- youtube_dl/PostProcessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fbf8a7f98..07b6895c0 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -178,7 +178,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: - if int(self._preferredquality) < 10: + # The opus codec doesn't support the -aq option + if int(self._preferredquality) < 10 and extension != 'opus': more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality] else: more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k'] From 4eb7f1d12e512fa69f90d98b2e6e97fa0c04e7a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 12 Oct 2013 13:49:27 +0200 Subject: [PATCH 31/62] FFmpegPostProcessor: print the command line used if the --verbose option is given --- youtube_dl/PostProcessor.py | 2 ++ youtube_dl/utils.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fbf8a7f98..0479591f0 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -83,6 +83,8 @@ class FFmpegPostProcessor(PostProcessor): + opts + [encodeFilename(self._ffmpeg_filename_argument(out_path))]) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() if p.returncode != 0: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 82a1daeb9..0457f3ded 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -9,6 +9,7 @@ import io import json import locale 
import os +import pipes import platform import re import socket @@ -927,3 +928,7 @@ class locked_file(object): def read(self, *args): return self.f.read(*args) + + +def shell_quote(args): + return ' '.join(map(pipes.quote, args)) From f5e54a1fda6fcc4ef279e54ff6cf63f6eae71bb0 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Sat, 12 Oct 2013 13:11:03 -0400 Subject: [PATCH 32/62] add support for NowVideo.ch --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nowvideo.py | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/nowvideo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 226c3a762..bc191a012 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE +from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py new file mode 100644 index 000000000..ab52ad401 --- /dev/null +++ b/youtube_dl/extractor/nowvideo.py @@ -0,0 +1,43 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urlparse + + +class NowVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)' + _TEST = { + u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa', + u'file': u'0mw0yow7b6dxa.flv', + u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817', + u'info_dict': { + u"title": u"youtubedl test video _BaW_jenozKc.mp4" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://www.nowvideo.ch/video/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'<h4>(.*)</h4>', + webpage, u'video title') + + video_key = self._search_regex(r'var fkzd="(.*)";', + webpage, u'video key') + + api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) + api_response = self._download_webpage(api_call, video_id, + u'Downloading API page') + video_url = compat_urlparse.parse_qs(api_response)[u'url'][0] + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + }] From d3f46b9aa5727323182dd845030c9d781e1824fd Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Sat, 12 Oct 2013 13:17:11 -0400 Subject: [PATCH 33/62] Add support for single-test tox runs Use a sintax like tox test.test_download:TestDownload.test_NowVideo to run the specific test on all the tox environments (Python versions) --- test/__init__.py | 0 test/test_age_restriction.py | 2 +- test/test_all_urls.py | 2 +- test/test_dailymotion_subtitles.py | 2 +- test/test_download.py | 4 ++-- test/test_playlists.py | 2 +- test/test_youtube_lists.py | 2 +- test/test_youtube_subtitles.py | 2 +- tox.ini | 7 +++++-- 9 files changed, 13 insertions(+), 10 deletions(-) create mode 100644 test/__init__.py diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 943f9a315..ec3e30572 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -8,7 +8,7 @@ import os 
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl import YoutubeDL -from helper import try_rm +from .helper import try_rm def _download_restricted(url, filename, age): diff --git a/test/test_all_urls.py b/test/test_all_urls.py index ff1c86efe..b28ad000b 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -8,7 +8,7 @@ import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors -from helper import get_testcases +from .helper import get_testcases class TestAllURLsMatching(unittest.TestCase): def setUp(self): diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index ed2ad311d..e655d280d 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -10,7 +10,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import DailymotionIE from youtube_dl.utils import * -from helper import FakeYDL +from .helper import FakeYDL md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/test/test_download.py b/test/test_download.py index fdf59bb5c..68da4d984 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -37,8 +37,8 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() -import helper # Set up remaining global configuration -from helper import get_testcases, try_rm +import test.helper as helper # Set up remaining global configuration +from .helper import get_testcases, try_rm defs = get_testcases() with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: diff --git a/test/test_playlists.py b/test/test_playlists.py index de8bd298a..108a4d63b 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -20,7 +20,7 @@ from youtube_dl.extractor import ( ) from youtube_dl.utils import * -from helper import FakeYDL +from .helper import FakeYDL class TestPlaylists(unittest.TestCase): def assertIsPlaylist(self, info): diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 53e65816d..0b5c79030 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -11,7 +11,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE from youtube_dl.utils import * -from helper import FakeYDL +from .helper import FakeYDL class TestYoutubeLists(unittest.TestCase): def assertIsPlaylist(self,info): diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index f9b0c1ad0..07850385e 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -10,7 +10,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeIE from youtube_dl.utils import * -from helper import FakeYDL +from .helper import FakeYDL md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/tox.ini b/tox.ini index 53b461fdb..ed01e3386 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,8 @@ [tox] envlist = py26,py27,py33 [testenv] -deps = nose -commands = nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose test +deps = + nose + coverage +commands = nosetests --verbose {posargs:test} # --with-coverage --cover-package=youtube_dl --cover-html + # test.test_download:TestDownload.test_NowVideo From 
d7e66d39a040886f940f4adf444be71e50e97391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 12 Oct 2013 21:34:04 +0200 Subject: [PATCH 34/62] Add an extractor for internetvideoarchive.com videos It's used by videodetective.com --- test/test_utils.py | 14 ++++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/internetvideoarchive.py | 71 ++++++++++++++++++++ youtube_dl/utils.py | 13 ++++ 4 files changed, 99 insertions(+) create mode 100644 youtube_dl/extractor/internetvideoarchive.py diff --git a/test/test_utils.py b/test/test_utils.py index ff2e9885b..f2c03d421 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( unified_strdate, find_xpath_attr, get_meta_content, + xpath_with_ns, ) if sys.version_info < (3, 0): @@ -141,5 +142,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(get_meta('description'), u'foo & bar') self.assertEqual(get_meta('author'), 'Plato') + def test_xpath_with_ns(self): + testxml = u'''<root xmlns:media="http://example.com/"> + <media:song> + <media:author>The Author</media:author> + <url>http://server.com/download.mp3</url> + </media:song> + </root>''' + doc = xml.etree.ElementTree.fromstring(testxml) + find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) + self.assertTrue(find('media:song') is not None) + self.assertEqual(find('media:song/media:author').text, u'The Author') + self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bc191a012..e50a89149 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -62,6 +62,7 @@ from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py new file mode 100644 index 000000000..52e3f9eec --- /dev/null +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -0,0 +1,71 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + xpath_with_ns, + determine_ext, +) + + +class InternetVideoArchiveIE(InfoExtractor): + _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + + _TEST = { + u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + u'file': u'452693.mp4', + u'info_dict': { + u'title': u'SKYFALL', + u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', + u'duration': 156, + }, + } + + @staticmethod + def _build_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query + + def _real_extract(self, url): + query = compat_urlparse.urlparse(url).query + query_dic = compat_urlparse.parse_qs(query) + video_id = query_dic['publishedid'][0] + url = self._build_url(query) + + flashconfiguration_xml = self._download_webpage(url, video_id, + u'Downloading flash configuration') + flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) + file_url = flashconfiguration.find('file').text + file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') + info_xml = self._download_webpage(file_url, video_id, + u'Downloading video info') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + item = info.find('channel/item') + + def _bp(p): + return xpath_with_ns(p, + {'media': 'http://search.yahoo.com/mrss/', + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'}) + formats = [] + for content in item.findall(_bp('media:group/media:content')): + attr = content.attrib + f_url = attr['url'] + formats.append({ + 'url': f_url, + 'ext': determine_ext(f_url), + 'width': int(attr['width']), + 'bitrate': int(attr['bitrate']), + }) + formats = sorted(formats, key=lambda f: f['bitrate']) + + info = { + 'id': video_id, + 'title': item.find('title').text, + 'formats': formats, + 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], + 'description': item.find('description').text, + 'duration': int(attr['duration']), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0457f3ded..3e81c308b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -230,6 +230,19 @@ else: return f return None +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): + components = [c.split(':') for c in path.split('/')] + replaced = [] + for c in components: + if len(c) == 1: + replaced.append(c[0]) + else: + ns, tag = c + replaced.append('{%s}%s' % (ns_map[ns], tag)) + return '/'.join(replaced) + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. From 3d60d33773e1be28955a74c3491edd13581aeb8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 12 Oct 2013 21:36:17 +0200 Subject: [PATCH 35/62] Add an extractor for videodetective.com (closes #262) It uses the internetvideoarchive.com platform. 
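Roughly how the hand-off works (a sketch; the publishedid comes from the test
case below, the customerid is whatever the page's og:video URL carries):

    page:     http://www.videodetective.com/movies/kick-ass-2/194487
    og:video: http://video.internetvideoarchive.net/flash/players/...?publishedid=194487&customerid=...
    result:   url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key())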
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/videodetective.py | 30 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/videodetective.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e50a89149..0f38bdd54 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -123,6 +123,7 @@ from .veoh import VeohIE from .vevo import VevoIE from .vice import ViceIE from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py new file mode 100644 index 000000000..265dd5b91 --- /dev/null +++ b/youtube_dl/extractor/videodetective.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor +from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + compat_urlparse, +) + + +class VideoDetectiveIE(InfoExtractor): + _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487', + u'file': u'194487.mp4', + u'info_dict': { + u'title': u'KICK-ASS 2', + u'description': u'md5:65ba37ad619165afac7d432eaded6013', + u'duration': 135, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + return self.url_result(InternetVideoArchiveIE._build_url(query), + ie=InternetVideoArchiveIE.ie_key()) From 4b7b839f24c6e95a4c1047de1a0a5194ef7f8fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 12 Oct 2013 22:21:23 +0200 Subject: [PATCH 36/62] Add an extractor for rottentomatoes.com and improve InternetVideoArchiveIE to get the best quality --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/internetvideoarchive.py | 16 ++++++++++++++++ youtube_dl/extractor/rottentomatoes.py | 16 ++++++++++++++++ youtube_dl/extractor/videodetective.py | 2 +- 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/rottentomatoes.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0f38bdd54..9dc9651ad 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -94,6 +94,7 @@ from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .sina import SinaIE diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 52e3f9eec..5986459d6 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -4,6 +4,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urlparse, + compat_urllib_parse, xpath_with_ns, determine_ext, ) @@ -26,6 +27,16 @@ class InternetVideoArchiveIE(InfoExtractor): def _build_url(query): return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query + @staticmethod + def _clean_query(query): + NEEDED_ARGS = ['publishedid', 'customerid'] + query_dic = compat_urlparse.parse_qs(query) + cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS) + # Other player ids return m3u8 urls + cleaned_dic['playerid'] = '247' + cleaned_dic['videokbrate'] = '100000' + return compat_urllib_parse.urlencode(cleaned_dic) + def _real_extract(self, url): query = compat_urlparse.urlparse(url).query query_dic = compat_urlparse.parse_qs(query) @@ -37,6 +48,11 @@ class InternetVideoArchiveIE(InfoExtractor): flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) file_url = flashconfiguration.find('file').text file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') + # Replace some of the parameters in the query to get the best quality + # and http links (no m3u8 manifests) + file_url = re.sub(r'(?<=\?)(.+)$', + lambda m: self._clean_query(m.group()), + file_url) info_xml = self._download_webpage(file_url, video_id, u'Downloading video info') info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py new file mode 100644 index 000000000..c79c39413 --- /dev/null +++ b/youtube_dl/extractor/rottentomatoes.py @@ -0,0 +1,16 @@ +from .videodetective import VideoDetectiveIE + + +# It just uses the same method as videodetective.com, +# the internetvideoarchive.com is extracted from the og:video property +class RottenTomatoesIE(VideoDetectiveIE): + _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', + u'file': '613340.mp4', + u'info_dict': { + u'title': u'TOY STORY 3', + u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + }, + } diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 265dd5b91..d89f84094 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -16,7 +16,7 @@ class VideoDetectiveIE(InfoExtractor): u'info_dict': { u'title': u'KICK-ASS 2', u'description': u'md5:65ba37ad619165afac7d432eaded6013', - u'duration': 135, + u'duration': 138, }, } From c40f5cf45ce896c021ed44fa22d79adbb05eaf5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 13 Oct 2013 13:54:31 +0200 Subject: [PATCH 37/62] [arte] add an extractor for creative.arte.tv (#1593) The +7 videos now use an independent extractor that is also used for the creative videos --- youtube_dl/extractor/__init__.py | 6 +- youtube_dl/extractor/arte.py | 156 +++++++++++++++++-------------- 2 files changed, 93 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9dc9651ad..837c5834d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,7 +2,11 @@ from .appletrailers import AppleTrailersIE from .addanime import AddAnimeIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE -from .arte import ArteTvIE +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, +) from .auengine import AUEngineIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 
4707d7cca..d296b6d63 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,15 +7,14 @@ from ..utils import ( ExtractorError, find_xpath_attr, unified_strdate, + determine_ext, ) +# There are different sources of video in arte.tv, the extraction process +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. + class ArteTvIE(InfoExtractor): - """ - There are two sources of video in arte.tv: videos.arte.tv and - www.arte.tv/guide, the extraction process is different for each one. - The videos expire in 7 days, so we can't add tests. - """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' _LIVE_URL = r'index-[0-9]+\.html$' @@ -24,7 +23,7 @@ class ArteTvIE(InfoExtractor): @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) + return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -55,14 +54,6 @@ class ArteTvIE(InfoExtractor): # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): - mobj = re.match(self._EMISSION_URL, url) - if mobj is not None: - lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') - return self._extract_emission(url, video_id, lang) - mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') @@ -80,59 +71,6 @@ class ArteTvIE(InfoExtractor): # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id, lang): - """Extract from www.arte.tv/guide""" - webpage = self._download_webpage(url, video_id) - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') - - json_info = self._download_webpage(json_url, video_id, 'Downloading info json') - self.report_extraction(video_id) - info = json.loads(json_info) - player_info = info['videoJsonPlayer'] - - info_dict = {'id': player_info['VID'], - 'title': player_info['VTI'], - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), - 'thumbnail': player_info['programImage'], - 'ext': 'flv', - } - - formats = player_info['VSR'].values() - def _match_lang(f): - # Return true if that format is in the language of the url - if lang == 'fr': - l = 'F' - elif lang == 'de': - l = 'A' - regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] - return any(re.match(r, f['versionCode']) for r in regexes) - # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # Some formats use the m3u8 protocol - formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats) - # We order the formats by quality - formats = sorted(formats, key=lambda f: int(f['height'])) - # Prefer videos without subtitles in the same language - formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) - # Pick the best quality - def _format(format_info): - info = {'ext': 'flv', - 'width': format_info.get('width'), - 'height': format_info.get('height'), - } - if format_info['mediaType'] == u'rtmp': - info['url'] = format_info['streamer'] - 
info['play_path'] = 'mp4:' + format_info['url'] - else: - info_dict['url'] = format_info['url'] - return info - info_dict['formats'] = [_format(f) for f in formats] - # TODO: Remove when #980 has been merged - info_dict.update(info_dict['formats'][-1]) - - return info_dict - def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') @@ -182,3 +120,85 @@ class ArteTvIE(InfoExtractor): 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ArteTVPlus7IE(InfoExtractor): + IE_NAME = u'arte.tv:+7' + _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + lang = mobj.group('lang') + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + + json_info = self._download_webpage(json_url, video_id, 'Downloading info json') + self.report_extraction(video_id) + info = json.loads(json_info) + player_info = info['videoJsonPlayer'] + + info_dict = { + 'id': player_info['VID'], + 'title': player_info['VTI'], + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + } + + formats = player_info['VSR'].values() + def _match_lang(f): + if f.get('versionCode') is None: + return True + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, formats) + # Some formats use the m3u8 protocol + formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) + # We order the formats by quality + formats = sorted(formats, key=lambda f: int(f.get('height',-1))) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) + # Pick the best quality + def _format(format_info): + info = { + 'width': format_info.get('width'), + 'height': format_info.get('height'), + } + if format_info['mediaType'] == u'rtmp': + info['url'] = format_info['streamer'] + info['play_path'] = 'mp4:' + format_info['url'] + info['ext'] = 'flv' + else: + info['url'] = format_info['url'] + info['ext'] = determine_ext(info['url']) + return info + info_dict['formats'] = [_format(f) for f in formats] + # TODO: Remove when #980 has been merged + info_dict.update(info_dict['formats'][-1]) + + return info_dict + + +# It also uses the arte_vp_url url from the webpage to extract the information +class ArteTVCreativeIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:creative' + _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)' + + _TEST = { + u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + u'file': u'050489-002.mp4', + u'info_dict': { + u'title': u'Agentur Amateur #2 - Corporate Design', + }, + } + From 69a0c470b5cbcb789ef0358b7f13a18bf7564fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= 
<jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 13 Oct 2013 14:21:13 +0200 Subject: [PATCH 38/62] [arte] Add an extractor for future.arte.tv (closes #1593) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/arte.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 837c5834d..d76945a48 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .arte import ( ArteTvIE, ArteTVPlus7IE, ArteTVCreativeIE, + ArteTVFutureIE, ) from .auengine import AUEngineIE from .bandcamp import BandcampIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d296b6d63..5ee8a67b1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import re import json import xml.etree.ElementTree @@ -8,6 +9,7 @@ from ..utils import ( find_xpath_attr, unified_strdate, determine_ext, + get_element_by_id, ) # There are different sources of video in arte.tv, the extraction process @@ -126,14 +128,21 @@ class ArteTVPlus7IE(InfoExtractor): IE_NAME = u'arte.tv:+7' _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + @classmethod + def _extract_url_info(cls, url): + mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') # This is not a real id, it can be for example AJT for the news # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal video_id = mobj.group('id') + return video_id, lang + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') json_info = self._download_webpage(json_url, video_id, 'Downloading info json') @@ -202,3 +211,21 @@ class ArteTVCreativeIE(ArteTVPlus7IE): }, } + +class ArteTVFutureIE(ArteTVPlus7IE): + IE_NAME = u'arte.tv:future' + _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)' + + _TEST = { + u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + u'file': u'050940-003.mp4', + u'info_dict': { + u'title': u'Les champignons au secours de la planète', + }, + } + + def _real_extract(self, url): + anchor_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, anchor_id) + row = get_element_by_id(anchor_id, webpage) + return self._extract_from_webpage(row, anchor_id, lang) From 9378ae6e1d6165c2402890c53c76f7975fee6d7b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 13 Oct 2013 15:54:53 +0200 Subject: [PATCH 39/62] [youku] Allow shortcut youku:ID and make non-matching groups non-matching (#1571) --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 00fa2ccb5..9d88c17f5 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -13,7 +13,7 @@ from ..utils import ( class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)' + _VALID_URL = 
r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)' _TEST = { u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", u"file": u"XNDgyMDQ2NTQw_part00.flv", From 1fb07d10a3f5f2baf1ebbdbc69d8ee8615cec2f9 Mon Sep 17 00:00:00 2001 From: Jai Grimshaw <jai@jaigrimshaw.com> Date: Mon, 14 Oct 2013 16:18:58 +1100 Subject: [PATCH 40/62] [youtube] Adds #1312 Download annotations Adds #1321 Download annotations from youtube Annotations are downloaded and written to a .annotations.xml file using the https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=$VIDEOID API. Added unit test for annotations. --- test/test_write_annotations.py | 82 +++++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 17 +++++++ youtube_dl/__init__.py | 4 ++ youtube_dl/extractor/youtube.py | 10 ++++ 4 files changed, 113 insertions(+) create mode 100644 test/test_write_annotations.py diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py new file mode 100644 index 000000000..ba7a9f50a --- /dev/null +++ b/test/test_write_annotations.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# coding: utf-8 + +import xml.etree.ElementTree +import os +import sys +import unittest + +# Allow direct execution +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import youtube_dl.YoutubeDL +import youtube_dl.extractor +from youtube_dl.utils import * +from .helper import try_rm + +PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") + +# General configuration (from __init__, not very elegant...) +jar = compat_cookiejar.CookieJar() +cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) +proxy_handler = compat_urllib_request.ProxyHandler() +opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) +compat_urllib_request.install_opener(opener) + +class YoutubeDL(youtube_dl.YoutubeDL): + def __init__(self, *args, **kwargs): + super(YoutubeDL, self).__init__(*args, **kwargs) + self.to_stderr = self.to_screen + +with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: + params = json.load(pf) +params['writeannotations'] = True +params['skip_download'] = True +params['writeinfojson'] = False +params['format'] = 'flv' + +TEST_ID = 'gr51aVj-mLg' +ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' +EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] + +class TestAnnotations(unittest.TestCase): + def setUp(self): + # Clear old files + self.tearDown() + + + def test_info_json(self): + expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text. + ie = youtube_dl.extractor.YoutubeIE() + ydl = YoutubeDL(params) + ydl.add_info_extractor(ie) + ydl.download([TEST_ID]) + self.assertTrue(os.path.exists(ANNOTATIONS_FILE)) + annoxml = None + with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof: + annoxml = xml.etree.ElementTree.parse(annof) + self.assertTrue(annoxml is not None, 'Failed to parse annotations XML') + root = annoxml.getroot() + self.assertEqual(root.tag, 'document') + annotationsTag = root.find('annotations') + self.assertEqual(annotationsTag.tag, 'annotations') + annotations = annotationsTag.findall('annotation') + + #Not all the annotations have TEXT children and the annotations are returned unsorted. 
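+        # Illustrative shape of the downloaded annotations XML, as exercised by
+        # the assertions in this test (element names only; real attributes vary):
+        #   <document>
+        #     <annotations>
+        #       <annotation type="text"><TEXT>Speech bubble</TEXT>...</annotation>
+        #       ...
+        #     </annotations>
+        #   </document>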
+ for a in annotations: + self.assertEqual(a.tag, 'annotation') + if a.get('type') == 'text': + textTag = a.find('TEXT') + text = textTag.text + self.assertTrue(text in expected) #assertIn only added in python 2.7 + #remove the first occurance, there could be more than one annotation with the same text + expected.remove(text) + #We should have seen (and removed) all the expected annotation texts. + self.assertEqual(len(expected), 0, 'Not all expected annotations were found.') + + + def tearDown(self): + try_rm(ANNOTATIONS_FILE) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e85e03fa4..c8054544a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -71,6 +71,7 @@ class YoutubeDL(object): logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatic subtitles to a file @@ -258,6 +259,10 @@ class YoutubeDL(object): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) + def report_writeannotations(self, annofn): + """ Report that the annotations file has been written. """ + self.to_screen(u'[info] Writing video annotations to: ' + annofn) + def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -522,6 +527,18 @@ class YoutubeDL(object): self.report_error(u'Cannot write description file ' + descfn) return + if self.params.get('writeannotations', False): + try: + annofn = filename + u'.annotations.xml' + self.report_writeannotations(annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return + subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3513d719f..fb1270ea2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -339,6 +339,9 @@ def parseOpts(overrideArguments=None): filesystem.add_option('--write-info-json', action='store_true', dest='writeinfojson', help='write video metadata to a .info.json file', default=False) + filesystem.add_option('--write-annotations', + action='store_true', dest='writeannotations', + help='write video annotations to a .annotation file', default=False) filesystem.add_option('--write-thumbnail', action='store_true', dest='writethumbnail', help='write thumbnail image to disk', default=False) @@ -601,6 +604,7 @@ def _real_main(argv=None): 'nopart': opts.nopart, 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, + 'writeannotations': opts.writeannotations, 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'writesubtitles': opts.writesubtitles, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8222a880f..4347651d7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1250,6 +1250,10 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url_map[itag] = format_url return url_map + def _extract_annotations(self, video_id): + url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id + return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') + def _real_extract(self, url): # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) @@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + # annotations + video_annotations = None + if self._downloader.params.get('writeannotations', False): + video_annotations = self._extract_annotations(video_id) + # Decide which formats to download try: @@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'subtitles': video_subtitles, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, + 'annotations': video_annotations }) return results From ea62a2da466e3fce802930d3685d53a159520719 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Mon, 14 Oct 2013 01:32:47 -0400 Subject: [PATCH 41/62] add VideoPremium.tv RTMP support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/videopremium.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/videopremium.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d76945a48..748f12e5a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -131,6 +131,7 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE +from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE from .wat import WatIE diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py new file mode 100644 index 000000000..65f39b982 --- /dev/null +++ b/youtube_dl/extractor/videopremium.py @@ -0,0 +1,40 @@ +import re +import random + +from .common import InfoExtractor + + +class VideoPremiumIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?' 
+ _TEST = { + u'url': u'http://videopremium.tv/4w7oadjsf156', + u'file': u'4w7oadjsf156.f4v', + u'info_dict': { + u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4" + }, + u'params': { + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage_url = 'http://videopremium.tv/' + video_id + webpage = self._download_webpage(webpage_url, video_id) + + self.report_extraction(video_id) + + video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<', + webpage, u'video title') + + return [{ + 'id': video_id, + 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16), + 'play_path': "mp4:%s.f4v" % video_id, + 'page_url': "http://videopremium.tv/" + video_id, + 'player_url': "http://videopremium.tv/uplayer/uppod.swf", + 'ext': 'f4v', + 'title': video_title, + }] From f9b3d7af471909a449c3bf5977a7aaa6555a3495 Mon Sep 17 00:00:00 2001 From: Andras Elso <elso.andras@gmail.com> Date: Mon, 14 Oct 2013 13:07:47 +0200 Subject: [PATCH 42/62] Add an extractor for Szombathelyi TV --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sztvhu.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/sztvhu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 748f12e5a..14ba6f358 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,7 @@ from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE +from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE from .ted import TEDIE from .tf1 import TF1IE diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py new file mode 100644 index 000000000..486f93d26 --- /dev/null +++ b/youtube_dl/extractor/sztvhu.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class SztvHuIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/([^/]+)/(?P<name>.+)' + _TEST = { + u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', + u'file': u'130909zoldnap.mp4', + u'md5': u'0047eacedc0afd1ceeac99e69173a07e', + u'info_dict': { + u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", + u"description" : u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. 
A PET...', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) +# file = self._search_regex(r'var fileHtml5 = "...:(.*?)";', + file = self._search_regex(r'file: "...:(.*?)",', + webpage, 'video file') + title = self._html_search_regex(r'<meta name="title" content="([^"]*)"', + webpage, 'video title').rsplit(' - ', 2)[0] + description = self._html_search_regex(r'<meta name="description" content="([^"]*)"/>', + webpage, 'video description') + thumbnail = self._og_search_thumbnail(webpage) + + video_url = 'http://media.sztv.hu/vod/' + file + + return {'id': name, + 'url' : video_url, + 'title': title, + 'ext': determine_ext(video_url), + 'description': description, + 'thumbnail': thumbnail, + } From c45aa560804e5be087b75c6e9fa8697020e57ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 14 Oct 2013 16:25:04 +0200 Subject: [PATCH 43/62] [gamespot] Fix video extraction (fixes #1587) --- youtube_dl/extractor/gamespot.py | 71 +++++++++++++++++--------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 5edbf678a..098768361 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,56 +1,59 @@ import re -import xml.etree.ElementTree +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, compat_urllib_parse, + compat_urlparse, + unescapeHTML, + get_meta_content, ) + class GameSpotIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' _TEST = { u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", - u"file": u"6410818.mp4", + u"file": u"gs-2300-6410818.mp4", u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { u"title": u"Arma 3 - Community Guide: SITREP I", - u"upload_date": u"20130627", + u'description': u'Check out this video where some of the basics of Arma 3 is explained.', } } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('page_id') + page_id = video_id = mobj.group('page_id') webpage = self._download_webpage(url, page_id) - video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"', - r'http://www\.gamespot\.com/videoembed/(\d+)'], - webpage, 'video id') - data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'}) - info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data - info_xml = self._download_webpage(info_url, video_id) - doc = xml.etree.ElementTree.fromstring(info_xml) - clip_el = doc.find('./playList/clip') + data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') + data_video = json.loads(unescapeHTML(data_video_json)) - http_urls = [{'url': node.find('filePath').text, - 'rate': int(node.find('rate').text)} - for node in clip_el.find('./httpURI')] - best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1] - video_url = best_quality['url'] - title = clip_el.find('./title').text - ext = video_url.rpartition('.')[2] - thumbnail_url = clip_el.find('./screenGrabURI').text - view_count = int(clip_el.find('./views').text) - upload_date = unified_strdate(clip_el.find('./postDate').text) + # Transform the manifest url to a link to the mp4 files + # they are used in mobile devices. 
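+        # Illustrative example of the rewrite below (hypothetical path, real
+        # names differ): a manifest path such as
+        #   /something/gs/SomeClip_,480,800,1500,.mp4.csmil/manifest.f4m
+        # becomes one progressive URL per listed quality, e.g.
+        #   http://video.gamespotcdn.com/gs/SomeClip_480.mp4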
+ f4m_url = data_video['videoStreams']['f4m_stream'] + f4m_path = compat_urlparse.urlparse(f4m_url).path + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',') + http_path = f4m_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/manifest.f4m', '') + http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) + formats = [] + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': q, + }) - return [{ - 'id' : video_id, - 'url' : video_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'upload_date' : upload_date, - 'view_count' : view_count, - }] + info = { + 'id': data_video['guid'], + 'title': compat_urllib_parse.unquote(data_video['title']), + 'formats': formats, + 'description': get_meta_content('description', webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info From 9ed3bdc64d0310e568883b9e81e3dd5114efd7ed Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 01:20:04 +0200 Subject: [PATCH 44/62] [tudou] Add support for youku links (Closes #1571) --- youtube_dl/extractor/tudou.py | 36 ++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 1405b73f7..79679a14a 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -7,15 +7,25 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' - _TEST = { + _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' 
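+    # The new albumplay pages (second test below) just embed a Youku vcode;
+    # when one is found, _real_extract returns a 'youku:' + vcode url result
+    # that is handled by YoukuIE.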
+ _TESTS = [{ u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', u'file': u'159448201.f4v', u'md5': u'140a49ed444bd22f93330985d8475fcb', u'info_dict': { u"title": u"卡马乔国足开大脚长传冲吊集锦" } - } + }, + { + u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html', + u'file': u'todo.mp4', + u'md5': u'todo.mp4', + u'info_dict': { + u'title': u'todo.mp4', + }, + u'add_ie': [u'Youku'], + u'skip': u'Only works from China' + }] def _url_for_id(self, id, quality = None): info_url = "http://v2.tudou.com/f?id="+str(id) @@ -29,14 +39,18 @@ class TudouIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(2) webpage = self._download_webpage(url, video_id) - title = re.search(",kw:\"(.+)\"",webpage) - if title is None: - title = re.search(",kw: \'(.+)\'",webpage) - title = title.group(1) - thumbnail_url = re.search(",pic: \'(.+?)\'",webpage) - if thumbnail_url is None: - thumbnail_url = re.search(",pic:\"(.+?)\"",webpage) - thumbnail_url = thumbnail_url.group(1) + + m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) + if m and m.group(1): + return { + '_type': 'url', + 'url': u'youku:' + m.group(1), + 'ie_key': 'Youku' + } + + title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title') + thumbnail_url = self._search_regex( + r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False) segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') segments = json.loads(segs_json) From 7cf67fbe29684f9681aa591a6eaeb43a5c6b5cb2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 01:33:20 +0200 Subject: [PATCH 45/62] [sztvhu] Simplify --- youtube_dl/extractor/sztvhu.py | 45 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index 486f93d26..cd3e203e6 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -5,37 +5,40 @@ import re from .common import InfoExtractor from ..utils import determine_ext + class SztvHuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/([^/]+)/(?P<name>.+)' + _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', - u'file': u'130909zoldnap.mp4', - u'md5': u'0047eacedc0afd1ceeac99e69173a07e', + u'file': u'20130909.mp4', + u'md5': u'a6df607b11fb07d0e9f2ad94613375cb', u'info_dict': { u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren", - u"description" : u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...', + u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. 
A PET...', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - webpage = self._download_webpage(url, name) -# file = self._search_regex(r'var fileHtml5 = "...:(.*?)";', - file = self._search_regex(r'file: "...:(.*?)",', - webpage, 'video file') - title = self._html_search_regex(r'<meta name="title" content="([^"]*)"', - webpage, 'video title').rsplit(' - ', 2)[0] - description = self._html_search_regex(r'<meta name="description" content="([^"]*)"/>', - webpage, 'video description') + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + video_file = self._search_regex( + r'file: "...:(.*?)",', webpage, 'video file') + title = self._html_search_regex( + r'<meta name="title" content="([^"]*) - [^-]*"', + webpage, 'video title') + description = self._html_search_regex( + r'<meta name="description" content="([^"]*)"/>', + webpage, 'video description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) - video_url = 'http://media.sztv.hu/vod/' + file + video_url = 'http://media.sztv.hu/vod/' + video_file - return {'id': name, - 'url' : video_url, - 'title': title, - 'ext': determine_ext(video_url), - 'description': description, - 'thumbnail': thumbnail, - } + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': determine_ext(video_url), + 'description': description, + 'thumbnail': thumbnail, + } From a623df4c7b099bc3adfe943c7155e55c6512aeff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 01:34:47 +0200 Subject: [PATCH 46/62] Credit @Elbandi for sztvhu --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fb1270ea2..f79b7796c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -31,6 +31,7 @@ __authors__ = ( 'Huarong Huo', 'Ismael Mejía', 'Steffan \'Ruirize\' James', + 'Andras Elso', ) __license__ = 'Public Domain' From 44a5f1718a5657a08082d8fd3201403bf2683c4f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 02:00:53 +0200 Subject: [PATCH 47/62] Simplify tests * Make them directly executable again * Move common stuff (md5, parameters) to helper * Never import * * General clean up --- test/helper.py | 28 ++++++++++++++-------- test/test_age_restriction.py | 12 ++++++---- test/test_all_urls.py | 18 +++++++++----- test/test_dailymotion_subtitles.py | 16 ++++++------- test/test_download.py | 38 ++++++++++++++---------------- test/test_playlists.py | 13 +++++----- test/test_utils.py | 14 +++++------ test/test_write_annotations.py | 37 ++++++++++++++--------------- test/test_write_info_json.py | 32 ++++++++++++------------- test/test_youtube_lists.py | 24 ++++++++++++------- test/test_youtube_signature.py | 16 ++++++++----- test/test_youtube_subtitles.py | 29 ++++++++++++++++------- 12 files changed, 154 insertions(+), 123 deletions(-) diff --git a/test/helper.py b/test/helper.py index ad1b74dd3..79a0ede48 100644 --- a/test/helper.py +++ b/test/helper.py @@ -1,22 +1,27 @@ import errno import io +import hashlib import json import os.path import re import types import youtube_dl.extractor -from youtube_dl import YoutubeDL, YoutubeDLHandler -from youtube_dl.utils import ( - compat_cookiejar, - compat_urllib_request, -) +from youtube_dl import YoutubeDL -youtube_dl._setup_opener(timeout=10) -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") -with io.open(PARAMETERS_FILE, 
encoding='utf-8') as pf: - parameters = json.load(pf) +def global_setup(): + youtube_dl._setup_opener(timeout=10) + + +def get_params(override=None): + PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "parameters.json") + with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: + parameters = json.load(pf) + if override: + parameters.update(override) + return parameters def try_rm(filename): @@ -32,7 +37,7 @@ class FakeYDL(YoutubeDL): def __init__(self): # Different instances of the downloader can't share the same dictionary # some test set the "sublang" parameter, which would break the md5 checks. - params = dict(parameters) + params = get_params() super(FakeYDL, self).__init__(params) self.result = [] @@ -62,3 +67,6 @@ def get_testcases(): for t in getattr(ie, '_TESTS', []): t['name'] = type(ie).__name__[:-len('IE')] yield t + + +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index ec3e30572..d500c6edc 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -1,14 +1,16 @@ #!/usr/bin/env python -import sys -import unittest - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup, try_rm +global_setup() + from youtube_dl import YoutubeDL -from .helper import try_rm def _download_restricted(url, filename, age): diff --git a/test/test_all_urls.py b/test/test_all_urls.py index b28ad000b..56e5f80e1 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -1,14 +1,20 @@ #!/usr/bin/env python -import sys -import unittest - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from test.helper import get_testcases + +from youtube_dl.extractor import ( + gen_extractors, + JustinTVIE, + YoutubeIE, +) -from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors -from .helper import get_testcases class TestAllURLsMatching(unittest.TestCase): def setUp(self): diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index e655d280d..c596415c4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -1,18 +1,16 @@ #!/usr/bin/env python -import sys -import unittest -import hashlib - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() + from youtube_dl.extractor import DailymotionIE -from youtube_dl.utils import * -from .helper import FakeYDL - -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestDailymotionSubtitles(unittest.TestCase): def setUp(self): diff --git a/test/test_download.py b/test/test_download.py index 68da4d984..b9a9be11d 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -1,26 +1,31 @@ #!/usr/bin/env python +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_params, 
get_testcases, global_setup, try_rm, md5 +global_setup() + + import hashlib import io -import os import json -import unittest -import sys import socket -import binascii - -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import youtube_dl.YoutubeDL -from youtube_dl.utils import * - -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") +from youtube_dl.utils import ( + compat_str, + compat_urllib_error, + DownloadError, + ExtractorError, + UnavailableVideoError, +) RETRIES = 3 -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() - class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen @@ -37,18 +42,12 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() -import test.helper as helper # Set up remaining global configuration -from .helper import get_testcases, try_rm defs = get_testcases() -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - parameters = json.load(pf) - class TestDownload(unittest.TestCase): maxDiff = None def setUp(self): - self.parameters = parameters self.defs = defs ### Dynamically generate tests @@ -68,8 +67,7 @@ def generator(test_case): print_skipping(test_case['skip']) return - params = self.parameters.copy() - params.update(test_case.get('params', {})) + params = get_params(test_case.get('params', {})) ydl = YoutubeDL(params) ydl.add_default_info_extractors() diff --git a/test/test_playlists.py b/test/test_playlists.py index 108a4d63b..d6a8d56df 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -1,13 +1,16 @@ #!/usr/bin/env python # encoding: utf-8 -import sys -import unittest -import json # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup +global_setup() + from youtube_dl.extractor import ( DailymotionPlaylistIE, @@ -18,9 +21,7 @@ from youtube_dl.extractor import ( LivestreamIE, NHLVideocenterIE, ) -from youtube_dl.utils import * -from .helper import FakeYDL class TestPlaylists(unittest.TestCase): def assertIsPlaylist(self, info): diff --git a/test/test_utils.py b/test/test_utils.py index f2c03d421..270669044 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -# Various small unit tests - -import sys -import unittest -import xml.etree.ElementTree - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +# Various small unit tests +import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform from youtube_dl.utils import ( diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index ba7a9f50a..6f08808cd 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -1,39 +1,38 @@ #!/usr/bin/env python # coding: utf-8 -import xml.etree.ElementTree +# Allow direct execution import os import sys import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import get_params, global_setup, try_rm +global_setup() + + 
+import io + +import xml.etree.ElementTree import youtube_dl.YoutubeDL import youtube_dl.extractor -from youtube_dl.utils import * -from .helper import try_rm +from youtube_dl.utils import True -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") - -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = self.to_screen -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - params = json.load(pf) -params['writeannotations'] = True -params['skip_download'] = True -params['writeinfojson'] = False -params['format'] = 'flv' +params = get_params({ + 'writeannotations': True, + 'skip_download': True, + 'writeinfojson': False, + 'format': 'flv', +}) + + TEST_ID = 'gr51aVj-mLg' ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py index de6d5180f..a5b6f6972 100644 --- a/test/test_write_info_json.py +++ b/test/test_write_info_json.py @@ -1,37 +1,34 @@ #!/usr/bin/env python # coding: utf-8 -import json +# Allow direct execution import os import sys import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import get_params, global_setup +global_setup() + + +import io +import json import youtube_dl.YoutubeDL import youtube_dl.extractor -from youtube_dl.utils import * -PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") - -# General configuration (from __init__, not very elegant...) -jar = compat_cookiejar.CookieJar() -cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) -proxy_handler = compat_urllib_request.ProxyHandler() -opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) -compat_urllib_request.install_opener(opener) class YoutubeDL(youtube_dl.YoutubeDL): def __init__(self, *args, **kwargs): super(YoutubeDL, self).__init__(*args, **kwargs) self.to_stderr = self.to_screen -with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: - params = json.load(pf) -params['writeinfojson'] = True -params['skip_download'] = True -params['writedescription'] = True +params = get_params({ + 'writeinfojson': True, + 'skip_download': True, + 'writedescription': True, +}) + TEST_ID = 'BaW_jenozKc' INFO_JSON_FILE = TEST_ID + '.mp4.info.json' @@ -42,6 +39,7 @@ This is a test video for youtube-dl. 
For more information, contact phihag@phihag.de .''' + class TestInfoJSON(unittest.TestCase): def setUp(self): # Clear old files diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 0b5c79030..c1753b5bb 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,20 +1,26 @@ #!/usr/bin/env python -import sys -import unittest -import json - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE -from youtube_dl.utils import * +from test.helper import FakeYDL, global_setup +global_setup() + + +from youtube_dl.extractor import ( + YoutubeUserIE, + YoutubePlaylistIE, + YoutubeIE, + YoutubeChannelIE, + YoutubeShowIE, +) -from .helper import FakeYDL class TestYoutubeLists(unittest.TestCase): - def assertIsPlaylist(self,info): + def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" self.assertEqual(info['_type'], 'playlist') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5007d9a16..5e1ff5eb0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -1,14 +1,18 @@ #!/usr/bin/env python +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import global_setup +global_setup() + + import io import re import string -import sys -import unittest - -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor import YoutubeIE from youtube_dl.utils import compat_str, compat_urlretrieve diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index 07850385e..00430a338 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -1,69 +1,79 @@ #!/usr/bin/env python -import sys -import unittest -import hashlib - # Allow direct execution import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() + from youtube_dl.extractor import YoutubeIE -from youtube_dl.utils import * -from .helper import FakeYDL -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class TestYoutubeSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() self.url = 'QRS8MkLhQmM' + def getInfoDict(self): IE = YoutubeIE(self.DL) info_dict = IE.extract(self.url) return info_dict + def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict[0]['subtitles'] + def test_youtube_no_writesubtitles(self): self.DL.params['writesubtitles'] = False subtitles = self.getSubtitles() self.assertEqual(subtitles, None) + def test_youtube_subtitles(self): self.DL.params['writesubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + def test_youtube_subtitles_lang(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + def 
test_youtube_allsubtitles(self): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) + def test_youtube_subtitles_sbv_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'sbv' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') + def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True self.DL.params['subtitlesformat'] = 'vtt' subtitles = self.getSubtitles() self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + def test_youtube_list_subtitles(self): self.DL.expect_warning(u'Video doesn\'t have automatic captions') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) + def test_youtube_automatic_captions(self): self.url = '8YoUxe5ncPo' self.DL.params['writeautomaticsub'] = True self.DL.params['subtitleslangs'] = ['it'] subtitles = self.getSubtitles() self.assertTrue(subtitles['it'] is not None) + def test_youtube_nosubtitles(self): self.DL.expect_warning(u'video doesn\'t have subtitles') self.url = 'sAjKT8FhjI8' @@ -71,6 +81,7 @@ class TestYoutubeSubtitles(unittest.TestCase): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles), 0) + def test_youtube_multiple_langs(self): self.url = 'QRS8MkLhQmM' self.DL.params['writesubtitles'] = True From a4fd04158eb7e570a0b2d27f6d9b6b9360644807 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 02:07:26 +0200 Subject: [PATCH 48/62] Do not import * --- youtube_dl/PostProcessor.py | 10 ++++++++-- youtube_dl/__init__.py | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 039e01498..13b56ede5 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -2,9 +2,15 @@ import os import subprocess import sys import time -import datetime -from .utils import * + +from .utils import ( + compat_subprocess_get_DEVNULL, + encodeFilename, + PostProcessingError, + shell_quote, + subtitles_filename, +) class PostProcessor(object): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f79b7796c..5248a92c7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -47,17 +47,43 @@ import shlex import socket import subprocess import sys -import warnings +import traceback import platform -from .utils import * +from .utils import ( + compat_cookiejar, + compat_print, + compat_str, + compat_urllib_request, + DateRange, + decodeOption, + determine_ext, + DownloadError, + get_cachedir, + make_HTTPS_handler, + MaxDownloadsReached, + platform_name, + preferredencoding, + SameFileError, + std_headers, + write_string, + YoutubeDLHandler, +) from .update import update_self from .version import __version__ -from .FileDownloader import * +from .FileDownloader import ( + FileDownloader, +) from .extractor import gen_extractors from .YoutubeDL import YoutubeDL -from .PostProcessor import * +from .PostProcessor import ( + FFmpegMetadataPP, + FFmpegVideoConvertor, + FFmpegExtractAudioPP, + FFmpegEmbedSubtitlePP, +) + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes): @@ -689,7 +715,7 @@ def _real_main(argv=None): if opts.cookiefile is not None: try: jar.save() - except (IOError, OSError) as err: + except (IOError, OSError): 
sys.exit(u'ERROR: unable to save cookie jar') sys.exit(retcode) From cd054fc491198a5a7c69d76f19693b1cd4d5c086 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 04:53:02 +0200 Subject: [PATCH 49/62] Use upper-case for prefixes in help to signify bytes (#1043) --- youtube_dl/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 472ae9c0c..3efa5dfd1 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -267,11 +267,11 @@ def parseOpts(overrideArguments=None): help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'') downloader.add_option('-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50k or 44.6m)') + dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)') downloader.add_option('-R', '--retries', dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) downloader.add_option('--buffer-size', - dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") + dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024") downloader.add_option('--no-resize-buffer', action='store_true', dest='noresizebuffer', help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) From 8381a92120c3826b471e6d2cc38045b5f3a9d15e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 15 Oct 2013 08:12:30 +0200 Subject: [PATCH 50/62] [websurg] Skipt the test It needs login information. 
--- youtube_dl/extractor/websurg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py index 7d335d444..43953bfdd 100644 --- a/youtube_dl/extractor/websurg.py +++ b/youtube_dl/extractor/websurg.py @@ -18,7 +18,8 @@ class WeBSurgIE(InfoExtractor): u'file': u'vd01en4012.mp4', u'params': { u'skip_download': True, - } + }, + u'skip': u'Requires login information', } _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1' From e772692ffd727631e65be90948b7e8c422738a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 15 Oct 2013 08:22:20 +0200 Subject: [PATCH 51/62] Fix an import in the tests and the Youtube Shows test --- test/test_write_annotations.py | 1 - test/test_youtube_lists.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 6f08808cd..35defb895 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -17,7 +17,6 @@ import xml.etree.ElementTree import youtube_dl.YoutubeDL import youtube_dl.extractor -from youtube_dl.utils import True class YoutubeDL(youtube_dl.YoutubeDL): diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c1753b5bb..4b7a7847b 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -106,7 +106,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeYDL() ie = YoutubeShowIE(dl) result = ie.extract('http://www.youtube.com/show/airdisasters') - self.assertTrue(len(result) >= 4) + self.assertTrue(len(result) >= 3) if __name__ == '__main__': unittest.main() From 9d74e308f7caa7f649809366ebcdb5a7caf560b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 15 Oct 2013 08:22:59 +0200 Subject: [PATCH 52/62] [sztvhu] Fix the title extraction --- youtube_dl/extractor/sztvhu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index cd3e203e6..81fa35c4b 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -25,7 +25,7 @@ class SztvHuIE(InfoExtractor): video_file = self._search_regex( r'file: "...:(.*?)",', webpage, 'video file') title = self._html_search_regex( - r'<meta name="title" content="([^"]*) - [^-]*"', + r'<meta name="title" content="([^"]*?) - [^-]*? 
- [^-]*?"', webpage, 'video title') description = self._html_search_regex( r'<meta name="description" content="([^"]*)"/>', From 9d4660cab15f374176f87d3f747a559142e4af9b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 12:05:13 +0200 Subject: [PATCH 53/62] [generic] Support embedded vimeo videos (#1602) --- test/test_utils.py | 16 ++++++++++++++++ youtube_dl/extractor/generic.py | 21 +++++++++++++++++++++ youtube_dl/extractor/vimeo.py | 11 +++++++++-- youtube_dl/utils.py | 17 +++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 270669044..f3fbff042 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 # Allow direct execution import os @@ -21,6 +22,8 @@ from youtube_dl.utils import ( find_xpath_attr, get_meta_content, xpath_with_ns, + smuggle_url, + unsmuggle_url, ) if sys.version_info < (3, 0): @@ -155,5 +158,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(find('media:song/media:author').text, u'The Author') self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3') + def test_smuggle_url(self): + data = {u"ö": u"ö", u"abc": [3]} + url = 'https://foo.bar/baz?x=y#a' + smug_url = smuggle_url(url, data) + unsmug_url, unsmug_data = unsmuggle_url(smug_url) + self.assertEqual(url, unsmug_url) + self.assertEqual(data, unsmug_data) + + res_url, res_data = unsmuggle_url(url) + self.assertEqual(res_url, url) + self.assertEqual(res_data, None) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d48c84f8d..89805250c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,6 +11,8 @@ from ..utils import ( compat_urlparse, ExtractorError, + smuggle_url, + unescapeHTML, ) from .brightcove import BrightcoveIE @@ -29,6 +31,17 @@ class GenericIE(InfoExtractor): u"title": u"R\u00e9gis plante sa Jeep" } }, + # embedded vimeo video + { + u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', + u'file': u'22444065.mp4', + u'md5': u'2903896e23df39722c33f015af0666e2', + u'info_dict': { + u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', + u"uploader_id": u"skillsmatter", + u"uploader": u"Skills Matter", + } + } ] def report_download_webpage(self, video_id): @@ -127,6 +140,14 @@ class GenericIE(InfoExtractor): bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) return self.url_result(bc_url, 'Brightcove') + # Look for embedded Vimeo player + mobj = re.search( + r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage) + if mobj: + player_url = unescapeHTML(mobj.group(1)) + surl = smuggle_url(player_url, {'Referer': url}) + return self.url_result(surl, 'Vimeo') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cea29f035..2de56ac81 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, std_headers, + unsmuggle_url, ) class VimeoIE(InfoExtractor): @@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor): u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', u'uploader': u'The BLN 
& Business of Software', }, - }, + } ] def _login(self): @@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor): self._login() def _real_extract(self, url, new_video=True): + url, data = unsmuggle_url(url) + headers = std_headers + if data is not None: + headers = headers.copy() + headers.update(data) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url, None, std_headers) + request = compat_urllib_request.Request(url, None, headers) webpage = self._download_webpage(request, video_id) # Now we begin extracting as much information as we can from what we diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3e81c308b..833f981f2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -945,3 +945,20 @@ class locked_file(object): def shell_quote(args): return ' '.join(map(pipes.quote, args)) + + +def smuggle_url(url, data): + """ Pass additional data in a URL for internal use. """ + + sdata = compat_urllib_parse.urlencode( + {u'__youtubedl_smuggle': json.dumps(data)}) + return url + u'#' + sdata + + +def unsmuggle_url(smug_url): + if not '#__youtubedl_smuggle' in smug_url: + return smug_url, None + url, _, sdata = smug_url.rpartition(u'#') + jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] + data = json.loads(jsond) + return url, data From 8abbf43f21d2afcfa2db1744a3f6ccfc917cc8d7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 15 Oct 2013 12:06:45 +0200 Subject: [PATCH 54/62] release 2013.10.15 --- README.md | 7 +++++-- youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8824daee2..6dae0a580 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,10 @@ which means you can modify it, redistribute it or use it however you like. file. Record all downloaded videos in it. ## Download Options: - -r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m) + -r, --rate-limit LIMIT maximum download rate in bytes per second (e.g. + 50K or 4.2M) -R, --retries RETRIES number of retries (default is 10) - --buffer-size SIZE size of download buffer (e.g. 1024 or 16k) + --buffer-size SIZE size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer do not automatically adjust the buffer size. By default, the buffer size is automatically resized @@ -100,6 +101,7 @@ which means you can modify it, redistribute it or use it however you like. file modification time --write-description write video description to a .description file --write-info-json write video metadata to a .info.json file + --write-annotations write video annotations to a .annotation file --write-thumbnail write thumbnail image to disk ## Verbosity / Simulation Options: @@ -166,6 +168,7 @@ which means you can modify it, redistribute it or use it however you like. 
processed files are overwritten by default --embed-subs embed subtitles in the video (only for mp4 videos) + --add-metadata add metadata to the files # CONFIGURATION diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1004af116..97dc5e9cc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.09' +__version__ = '2013.10.15' From 996d1c3242be5569bb4b579b2e3ad25a6d928dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 15 Oct 2013 23:08:52 +0200 Subject: [PATCH 55/62] Don't include the test/testdata directory in the youtube-dl.tar.gz The last releases included big files that increased the size of the compressed file. --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 85dacfa4c..abd89be49 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache' \ --exclude '.git' \ + --exclude 'testdata' \ -- \ bin devscripts test youtube_dl \ CHANGELOG LICENSE README.md README.txt \ From 76965512daae80b7f1e43f063308ff93d6dfbc8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 15 Oct 2013 23:15:15 +0200 Subject: [PATCH 56/62] Fix the indentation of the Makefile It uses tabs, no spaces. --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index abd89be49..c6d09932b 100644 --- a/Makefile +++ b/Makefile @@ -13,13 +13,13 @@ PYTHON=/usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc + SYSCONFDIR=/etc else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif + ifeq ($(PREFIX),/usr/local) + SYSCONFDIR=/etc + else + SYSCONFDIR=$(PREFIX)/etc + endif endif install: youtube-dl youtube-dl.1 youtube-dl.bash-completion @@ -71,7 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*~' \ --exclude '__pycache' \ --exclude '.git' \ - --exclude 'testdata' \ + --exclude 'testdata' \ -- \ bin devscripts test youtube_dl \ CHANGELOG LICENSE README.md README.txt \ From bfd14b1b2fdf1f0e54e639f9695f73edf578e241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 16 Oct 2013 16:57:40 +0200 Subject: [PATCH 57/62] Add an extractor for rutube.ru (closes #1136) It downloads with a m3u8 manifest, requires ffmpeg. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/rutube.py | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 youtube_dl/extractor/rutube.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5f0e2ec9b..4f20fbd1a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -102,6 +102,7 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE +from .rutube import RutubeIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py new file mode 100644 index 000000000..a18034fe2 --- /dev/null +++ b/youtube_dl/extractor/rutube.py @@ -0,0 +1,58 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + compat_str, + ExtractorError, +) + + +class RutubeIE(InfoExtractor): + _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)' + + _TEST = { + u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4', + u'info_dict': { + u'title': u'Раненный кенгуру забежал в аптеку', + u'uploader': u'NTDRussian', + u'uploader_id': u'29790', + }, + u'params': { + # It requires ffmpeg (m3u8 download) + u'skip_download': True, + }, + } + + def _get_api_response(self, short_id, subpath): + api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id) + response_json = self._download_webpage(api_url, short_id, + u'Downloading %s json' % subpath) + return json.loads(response_json) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + long_id = mobj.group('long_id') + webpage = self._download_webpage(url, long_id) + og_video = self._og_search_video_url(webpage) + short_id = compat_urlparse.urlparse(og_video).path[1:] + options = self._get_api_response(short_id, 'options') + trackinfo = self._get_api_response(short_id, 'trackinfo') + # Some videos don't have the author field + author = trackinfo.get('author') or {} + m3u8_url = trackinfo['video_balancer'].get('m3u8') + if m3u8_url is None: + raise ExtractorError(u'Couldn\'t find m3u8 manifest url') + + return { + 'id': trackinfo['id'], + 'title': trackinfo['title'], + 'url': m3u8_url, + 'ext': 'mp4', + 'thumbnail': options['thumbnail_url'], + 'uploader': author.get('name'), + 'uploader_id': compat_str(author['id']) if author else None, + } From 2d0efe70a684cf378c6c325eafc8e52a85321157 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Oct 2013 00:46:11 +0200 Subject: [PATCH 58/62] [brightcove] Fix more broken XML (#1608) --- youtube_dl/extractor/brightcove.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 745212f2f..58f3d9708 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor): # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>', lambda m: m.group(1) + '/>', object_str) + # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 + object_str = object_str.replace(u'<--', u'<!--') object_doc = xml.etree.ElementTree.fromstring(object_str) assert u'BrightcoveExperience' in object_doc.attrib['class'] 
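A note on the Brightcove change above: both fix-ups exist only to turn the page's sloppy <object> markup into something xml.etree.ElementTree will accept before any attribute is read. The standalone sketch below is illustrative and not part of the series; the sample markup is invented here, but the two substitutions are the ones BrightcoveIE applies (the <param> one for issue #1553, the "<--" one for issue #1608), and fromstring() only succeeds once both have run.

import re
import xml.etree.ElementTree as ET

# Hypothetical embed markup in the style that triggers issues #1553 and #1608:
# a <param> tag that is never closed, and a comment opened with "<--" instead of "<!--".
object_str = (
    '<object class="BrightcoveExperience">'
    '<param name="playerID" value="1234567890">'
    '<-- tracking comment dropped in by the page -->'
    '</object>'
)

# Fix-up for #1553: self-close the bare <param ...> tags.
object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
                    lambda m: m.group(1) + '/>', object_str)
# Fix-up for #1608: turn the bogus "<--" comment opener into a real XML comment.
object_str = object_str.replace('<--', '<!--')

object_doc = ET.fromstring(object_str)  # raises ParseError if either fix-up is skipped
print(object_doc.find('param').attrib['value'])  # -> 1234567890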
From 591454798d330adfcf8e22ef66fed7bbdf9f628b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Oct 2013 01:02:17 +0200 Subject: [PATCH 59/62] [brightcove] Raise error if playlist is empty (#1608) --- youtube_dl/extractor/brightcove.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 58f3d9708..1392f382a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -98,7 +98,10 @@ class BrightcoveIE(InfoExtractor): playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, player_key, u'Downloading playlist information') - playlist_info = json.loads(playlist_info)['videoList'] + json_data = json.loads(playlist_info) + if 'videoList' not in json_data: + raise ExtractorError(u'Empty playlist') + playlist_info = json_data['videoList'] videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] return self.playlist_result(videos, playlist_id=playlist_info['id'], From a733eb6c534625b51e42763d8c4b8f29e176e512 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Oct 2013 02:19:19 +0200 Subject: [PATCH 60/62] [youtube] Do not crash if caption info is missing altogether (Fixes #1610) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4347651d7..fb7c42830 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1150,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): list_page = self._download_webpage(list_url, video_id) caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) original_lang_node = caption_list.find('track') - if original_lang_node.attrib.get('kind') != 'asr' : + if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] From 54ed626cf8fa68b76a6ae21f659d84482ab319df Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Oct 2013 02:20:26 +0200 Subject: [PATCH 61/62] release 2013.10.17 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 97dc5e9cc..22a51ffe6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.15' +__version__ = '2013.10.17' From d21ab292008b114ea9e99edc2e9f2adde49415ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 17 Oct 2013 08:20:58 +0200 Subject: [PATCH 62/62] Add an extractor for techtalks.tv (closes #1606) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/techtalks.py | 65 +++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 youtube_dl/extractor/techtalks.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4f20fbd1a..db69af361 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -115,6 +115,7 @@ from .statigram import StatigramIE from .steam import SteamIE from .sztvhu import SztvHuIE from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE from .ted import TEDIE 
from .tf1 import TF1IE from .thisav import ThisAVIE diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py new file mode 100644 index 000000000..a55f236cb --- /dev/null +++ b/youtube_dl/extractor/techtalks.py @@ -0,0 +1,65 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + get_element_by_attribute, + clean_html, +) + + +class TechTalksIE(InfoExtractor): + _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/' + + _TEST = { + u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', + u'playlist': [ + { + u'file': u'57758.flv', + u'info_dict': { + u'title': u'Learning Topic Models --- Going beyond SVD', + }, + }, + { + u'file': u'57758-slides.flv', + u'info_dict': { + u'title': u'Learning Topic Models --- Going beyond SVD', + }, + }, + ], + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + talk_id = mobj.group('id') + webpage = self._download_webpage(url, talk_id) + rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage, + u'rtmp url') + play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', + webpage, u'presenter play path') + title = clean_html(get_element_by_attribute('class', 'title', webpage)) + video_info = { + 'id': talk_id, + 'title': title, + 'url': rtmp_url, + 'play_path': play_path, + 'ext': 'flv', + } + m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage) + if m_slides is None: + return video_info + else: + return [ + video_info, + # The slides video + { + 'id': talk_id + '-slides', + 'title': title, + 'url': rtmp_url, + 'play_path': m_slides.group(1), + 'ext': 'flv', + }, + ]
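To close the series, a quick illustration of what the new TechTalks extractor returns. This is a hypothetical invocation, not part of the patches: it follows the same pattern the refactored tests above use (a FakeYDL instance plus IE.extract()), assumes it is run from the root of a youtube-dl checkout at this point in the history with test/parameters.json in place, and needs network access; the URL is the one from the extractor's own _TEST block. When the talk has a slides stream, extract() returns a two-entry list (ids '57758' and '57758-slides') sharing one RTMP URL but using different play paths; otherwise it returns a single info dict.

#!/usr/bin/env python
# Run from the root of a youtube-dl checkout.
from test.helper import FakeYDL, global_setup
global_setup()

from youtube_dl.extractor import TechTalksIE

ydl = FakeYDL()
ie = TechTalksIE(ydl)
result = ie.extract(
    'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/')

# Normalize: a single dict for a talk without slides, a list when slides exist.
entries = result if isinstance(result, list) else [result]
for entry in entries:
    print(entry['id'], entry['ext'], entry['play_path'])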