From f44415360e7bdf1b7b90c0c4b08199518210f009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 18 Oct 2013 13:49:25 +0200 Subject: [PATCH 01/81] Use the console_scripts entry point if setuptools is available --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3b6dc2d40..347a4f2d8 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ import sys try: from setuptools import setup + setuptools_available = True except ImportError: from distutils.core import setup @@ -43,13 +44,16 @@ if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': params = py2exe_params else: params = { - 'scripts': ['bin/youtube-dl'], 'data_files': [ # Installing system-wide would require sudo... ('etc/bash_completion.d', ['youtube-dl.bash-completion']), ('share/doc/youtube_dl', ['README.txt']), ('share/man/man1/', ['youtube-dl.1']) ] } + if setuptools_available: + params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']} + else: + params['scripts'] = ['bin/youtube-dl'] # Get the version from youtube_dl/version.py without importing the package exec(compile(open('youtube_dl/version.py').read(), From b0505eb6113ab6c02543d7b8272da39d8d57eff8 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 19 Oct 2013 16:46:17 +0200 Subject: [PATCH 02/81] [CinemassacreIE] Fix information extraction --- youtube_dl/extractor/cinemassacre.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 6925b96c2..8260e8192 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -12,6 +12,7 @@ class CinemassacreIE(InfoExtractor): _TESTS = [{ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', u'file': u'19911.flv', + u'md5': u'f9bb7ede54d1229c9846e197b4737e06', u'info_dict': { u'upload_date': u'20121110', u'title': u'“Angry Video 
Game Nerd: The Movie” – Trailer', @@ -25,6 +26,7 @@ class CinemassacreIE(InfoExtractor): { u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', u'file': u'521be8ef82b16.flv', + u'md5': u'91b248e1e2473d5bff55d6010518111f', u'info_dict': { u'upload_date': u'20131002', u'title': u'The Mummy’s Hand (1940)', @@ -55,23 +57,29 @@ class CinemassacreIE(InfoExtractor): video_description = None playerdata = self._download_webpage(playerdata_url, video_id) - base_url = self._html_search_regex(r'\'streamer\': \'(?Prtmp://.*?)/(?:vod|Cinemassacre)\'', - playerdata, u'base_url') - base_url += '/Cinemassacre/' - # Important: The file names in playerdata are not used by the player and even wrong for some videos - sd_file = 'Cinemassacre-%s_high.mp4' % video_id - hd_file = 'Cinemassacre-%s.mp4' % video_id - video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + url = self._html_search_regex(r'\'streamer\': \'(?P[^\']+)\'', playerdata, u'url') + player_url = self._html_search_regex(r'\'flashplayer\': \'(?P[^\']+)\'', playerdata, u'player_url') + page_url = re.split(r'(?<=[^/])/([^/]|$)', player_url)[0] + + sd_file = self._html_search_regex(r'\'file\': \'(?P[^\']+)\'', playerdata, u'sd_file') + hd_file = self._html_search_regex(r'\'?file\'?: "(?P[^"]+)"', playerdata, u'hd_file') + video_thumbnail = self._html_search_regex(r'\'image\': \'(?P[^\']+)\'', playerdata, u'thumbnail', fatal=False) formats = [ { - 'url': base_url + sd_file, + 'url': url, + 'player_url': player_url, + 'page_url': page_url, + 'play_path': 'mp4:' + sd_file, 'ext': 'flv', 'format': 'sd', 'format_id': 'sd', }, { - 'url': base_url + hd_file, + 'url': url, + 'player_url': player_url, + 'page_url': page_url, + 'play_path': 'mp4:' + hd_file, 'ext': 'flv', 'format': 'hd', 'format_id': 'hd', From 71907db3ba28b1d32c3294d9e3bec0c08fb98ad3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 23 Oct 2013 11:38:51 +0200 Subject: [PATCH 03/81] 
[vimeo] Fix normal videos (Fixes #1642) Vimeo Pro Videos are still broken --- youtube_dl/extractor/vimeo.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 1125513c7..bf48671b3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -25,7 +25,7 @@ class VimeoIE(InfoExtractor): { u'url': u'http://vimeo.com/56015672', u'file': u'56015672.mp4', - u'md5': u'8879b6cc097e987f02484baf890129e5', + u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', u'info_dict': { u"upload_date": u"20121220", u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", @@ -129,10 +129,11 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], - webpage, u'info section', flags=re.DOTALL) - config = json.loads(config) - except: + config_url = self._html_search_regex( + r' data-config-url="(.+?)"', webpage, u'config URL') + config_json = self._download_webpage(config_url, video_id) + config = json.loads(config_json) + except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') @@ -140,7 +141,8 @@ class VimeoIE(InfoExtractor): self._verify_video_password(url, video_id, webpage) return self._real_extract(url) else: - raise ExtractorError(u'Unable to extract info section') + raise ExtractorError(u'Unable to extract info section', + cause=e) # Extract title video_title = config["video"]["title"] From 55b3e45bbab3af5132d45c8f3f8f19fae5f5f1d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 14:38:03 +0200 Subject: [PATCH 04/81] [vimeo] Fix pro videos and 
player.vimeo.com urls The old process can still be used for those videos. Added RegexNotFoundError, which is raised by _search_regex if it can't extract the info. --- youtube_dl/extractor/common.py | 5 +++-- youtube_dl/extractor/vimeo.py | 6 ++++++ youtube_dl/utils.py | 5 +++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7d7ce5d98..aaa5c24c8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,6 +14,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + RegexNotFoundError, unescapeHTML, ) @@ -231,7 +232,7 @@ class InfoExtractor(object): Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. In case of failure return a default value or raise a WARNING or a - ExtractorError, depending on fatal, specifying the field name. + RegexNotFoundError, depending on fatal, specifying the field name. 
""" if isinstance(pattern, (str, compat_str, compiled_regex_type)): mobj = re.search(pattern, string, flags) @@ -251,7 +252,7 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s' % _name) + raise RegexNotFoundError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on http://yt-dl.org/bug' % _name) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bf48671b3..ad2f75d6b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -10,6 +10,7 @@ from ..utils import ( clean_html, get_element_by_attribute, ExtractorError, + RegexNotFoundError, std_headers, unsmuggle_url, ) @@ -133,6 +134,11 @@ class VimeoIE(InfoExtractor): r' data-config-url="(.+?)"', webpage, u'config URL') config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) + except RegexNotFoundError: + # For pro videos or player.vimeo.com urls + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) + config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bfb8f6bcd..1d9785341 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -572,6 +572,11 @@ class ExtractorError(Exception): return u''.join(traceback.format_tb(self.traceback)) +class RegexNotFoundError(ExtractorError): + """Error when a regex didn't match""" + pass + + class DownloadError(Exception): """Download Error exception. 
From 0a89b2852e927914ecd5643d956449a4841a3141 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 23 Oct 2013 15:12:33 +0200 Subject: [PATCH 05/81] release 2013.10.23.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a5b56d894..df6002970 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.23' +__version__ = '2013.10.23.1' From 93b22c7828911668c503e868d6be053e8a0deb7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 16:31:53 +0200 Subject: [PATCH 06/81] [vimeo] fix the extraction for videos protected with password Added a test video. --- youtube_dl/extractor/vimeo.py | 39 +++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ad2f75d6b..ef90fecc0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,3 +1,4 @@ +# encoding: utf-8 import json import re import itertools @@ -55,7 +56,22 @@ class VimeoIE(InfoExtractor): u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', u'uploader': u'The BLN & Business of Software', }, - } + }, + { + u'url': u'http://vimeo.com/68375962', + u'file': u'68375962.mp4', + u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7', + u'note': u'Video protected with password', + u'info_dict': { + u'title': u'youtube-dl password protected test video', + u'upload_date': u'20130614', + u'uploader_id': u'user18948128', + u'uploader': u'Jaime Marquínez Ferrándiz', + }, + u'params': { + u'videopassword': u'youtube-dl', + }, + }, ] def _login(self): @@ -130,20 +146,21 @@ class VimeoIE(InfoExtractor): # Extract the config JSON try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, u'config URL') - config_json = self._download_webpage(config_url, 
video_id) - config = json.loads(config_json) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], - webpage, u'info section', flags=re.DOTALL) - config = json.loads(config) + try: + config_url = self._html_search_regex( + r' data-config-url="(.+?)"', webpage, u'config URL') + config_json = self._download_webpage(config_url, video_id) + config = json.loads(config_json) + except RegexNotFoundError: + # For pro videos or player.vimeo.com urls + config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'], + webpage, u'info section', flags=re.DOTALL) + config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') - if re.search('If so please provide the correct password.', webpage): + if re.search(']+?id="pw_form"', webpage) is not None: self._verify_video_password(url, video_id, webpage) return self._real_extract(url) else: From 3126050c0fe204dfb2669f794097648b9c9fa8c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 16:32:17 +0200 Subject: [PATCH 07/81] Hide the video password on verbose mode --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fce1adf0c..c141dcdda 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -133,7 +133,7 @@ def parseOpts(overrideArguments=None): def _hide_login_info(opts): opts = list(opts) - for private_opt in ['-p', '--password', '-u', '--username']: + for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) opts[i+1] = '' From 2450bcb28b46a6cb3d9f9accfdfd3ef6b7ac5f66 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 17:00:33 +0200 Subject: [PATCH 08/81] [nowvideo] Fix key extraction Extract it from the embed page --- youtube_dl/extractor/nowvideo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index ab52ad401..241cc160b 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -20,7 +20,10 @@ class NowVideoIE(InfoExtractor): video_id = mobj.group('id') webpage_url = 'http://www.nowvideo.ch/video/' + video_id + embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id webpage = self._download_webpage(webpage_url, video_id) + embed_page = self._download_webpage(embed_url, video_id, + u'Downloading embed page') self.report_extraction(video_id) @@ -28,7 +31,7 @@ class NowVideoIE(InfoExtractor): webpage, u'video title') video_key = self._search_regex(r'var fkzd="(.*)";', - webpage, u'video key') + embed_page, u'video key') api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key) api_response = self._download_webpage(api_call, video_id, From cdec0190c48b90465c5340f7d8af1370dae2cc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 17:33:38 +0200 Subject: [PATCH 09/81] [dailymotion] Extract all the available formats (closes #1028) --- youtube_dl/extractor/dailymotion.py | 41 +++++++++++++++++++---------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7d8353946..4c0488245 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,6 +28,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' + 
+ _FORMATS = [ + (u'stream_h264_ld_url', u'ld'), + (u'stream_h264_url', u'standard'), + (u'stream_h264_hq_url', u'hq'), + (u'stream_h264_hd_url', u'hd'), + (u'stream_h264_hd1080_url', u'hd180'), + ] + _TESTS = [ { u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', @@ -60,7 +69,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(1).split('_')[0].split('?')[0] - video_extension = 'mp4' url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information @@ -99,18 +107,24 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] raise ExtractorError(msg, expected=True) - # TODO: support choosing qualities - - for key in ['stream_h264_hd1080_url','stream_h264_hd_url', - 'stream_h264_hq_url','stream_h264_url', - 'stream_h264_ld_url']: - if info.get(key):#key in info and info[key]: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: + formats = [] + for (key, format_id) in self._FORMATS: + video_url = info.get(key) + if video_url is not None: + m_size = re.search(r'H264-(\d+)x(\d+)', video_url) + if m_size is not None: + width, height = m_size.group(1), m_size.group(2) + else: + width, height = None, None + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': format_id, + 'width': width, + 'height': height, + }) + if not formats: raise ExtractorError(u'Unable to extract video URL') - video_url = info[max_quality] # subtitles video_subtitles = self.extract_subtitles(video_id) @@ -120,11 +134,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'formats': formats, 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), - 'ext': video_extension, 'subtitles': video_subtitles, 'thumbnail': 
info['thumbnail_url'] }] From 1cf64ee4685e0f26b2d4dc28d9635351a36007b6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 23 Oct 2013 18:38:09 +0200 Subject: [PATCH 10/81] release 2013.10.23.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index df6002970..b4ce6068f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.23.1' +__version__ = '2013.10.23.2' From fcc28edb2f86bb62ab8b3fcbacf0818991cd3058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Oct 2013 20:21:25 +0200 Subject: [PATCH 11/81] [cinemassacre] Simplify * Remove some rtmp parameters that are not needed. * Remove the md5 checksums, the video is not downloaded. * Remove the code used before the current format system. --- youtube_dl/extractor/cinemassacre.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 8260e8192..2fe1033f0 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -12,7 +12,6 @@ class CinemassacreIE(InfoExtractor): _TESTS = [{ u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', u'file': u'19911.flv', - u'md5': u'f9bb7ede54d1229c9846e197b4737e06', u'info_dict': { u'upload_date': u'20121110', u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', @@ -26,7 +25,6 @@ class CinemassacreIE(InfoExtractor): { u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', u'file': u'521be8ef82b16.flv', - u'md5': u'91b248e1e2473d5bff55d6010518111f', u'info_dict': { u'upload_date': u'20131002', u'title': u'The Mummy’s Hand (1940)', @@ -58,8 +56,6 @@ class CinemassacreIE(InfoExtractor): playerdata = self._download_webpage(playerdata_url, video_id) url = self._html_search_regex(r'\'streamer\': \'(?P[^\']+)\'', playerdata, 
u'url') - player_url = self._html_search_regex(r'\'flashplayer\': \'(?P[^\']+)\'', playerdata, u'player_url') - page_url = re.split(r'(?<=[^/])/([^/]|$)', player_url)[0] sd_file = self._html_search_regex(r'\'file\': \'(?P[^\']+)\'', playerdata, u'sd_file') hd_file = self._html_search_regex(r'\'?file\'?: "(?P[^"]+)"', playerdata, u'hd_file') @@ -68,8 +64,6 @@ class CinemassacreIE(InfoExtractor): formats = [ { 'url': url, - 'player_url': player_url, - 'page_url': page_url, 'play_path': 'mp4:' + sd_file, 'ext': 'flv', 'format': 'sd', @@ -77,8 +71,6 @@ class CinemassacreIE(InfoExtractor): }, { 'url': url, - 'player_url': player_url, - 'page_url': page_url, 'play_path': 'mp4:' + hd_file, 'ext': 'flv', 'format': 'hd', @@ -86,7 +78,7 @@ class CinemassacreIE(InfoExtractor): }, ] - info = { + return { 'id': video_id, 'title': video_title, 'formats': formats, @@ -94,6 +86,3 @@ class CinemassacreIE(InfoExtractor): 'upload_date': video_date, 'thumbnail': video_thumbnail, } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info From 00fe14fc758173840f813b339960681e8e7d29d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 25 Oct 2013 16:52:58 +0200 Subject: [PATCH 12/81] [youtube] Also use the 'adaptative_fmts' field from the /get_video_info page (fixes #1649) The 'adaptative_fmts' field from the video page is not added to the 'url_encoded_fmt_stream_map' --- youtube_dl/extractor/youtube.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7a7bbe265..8fb07d100 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1405,32 +1405,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: raise ValueError(u'No stream_map present') # caught below - m_s = re.search(r'[&,]s=', 
args['url_encoded_fmt_stream_map']) + re_signature = re.compile(r'[&,]s=') + m_s = re_signature.search(args['url_encoded_fmt_stream_map']) if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] - m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u'')) + m_s = re_signature.search(args.get('adaptive_fmts', u'')) if m_s is not None: - if 'url_encoded_fmt_stream_map' in video_info: - video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts'] + if 'adaptive_fmts' in video_info: + video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts'] else: - video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']] - elif 'adaptive_fmts' in video_info: - if 'url_encoded_fmt_stream_map' in video_info: - video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0] - else: - video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts'] + video_info['adaptive_fmts'] = [args['adaptive_fmts']] except ValueError: pass if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() video_url_list = [(None, video_info['conn'][0])] - elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: - if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]: + elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: + encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] + if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) url_map = {} - for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): + for url_data_str in encoded_url_map.split(','): url_data = 
compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: url = url_data['url'][0] From ea32fbacc8939e94f7db9c9a5eb167ada2af5f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 25 Oct 2013 16:55:37 +0200 Subject: [PATCH 13/81] Fix the extensions of two tests with youtube videos The best quality is now a mp4 video. --- youtube_dl/extractor/metacafe.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index e537648ff..234b9e80f 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -23,7 +23,7 @@ class MetacafeIE(InfoExtractor): _TESTS = [{ u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - u"file": u"_aUehQsCQtM.flv", + u"file": u"_aUehQsCQtM.mp4", u"info_dict": { u"upload_date": u"20090102", u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!", diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8fb07d100..2884b359c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -348,7 +348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): }, { u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U", - u"file": u"1ltcDfZMA3U.flv", + u"file": u"1ltcDfZMA3U.mp4", u"note": u"Test VEVO video (#897)", u"info_dict": { u"upload_date": u"20070518", From 600cc1a4f0503651e4fd94af967d25dab3645859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 25 Oct 2013 17:11:29 +0200 Subject: [PATCH 14/81] [youtube] Set the format_id field to the itag of the format (closes #1624) --- youtube_dl/extractor/youtube.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 
2884b359c..d05d0a8c1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1480,13 +1480,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') results = [] - for format_param, video_real_url in video_url_list: + for itag, video_real_url in video_url_list: # Extension - video_extension = self._video_extensions.get(format_param, 'flv') + video_extension = self._video_extensions.get(itag, 'flv') - video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???'), - ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '') + video_format = '{0} - {1}{2}'.format(itag if itag else video_extension, + self._video_dimensions.get(itag, '???'), + ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '') results.append({ 'id': video_id, @@ -1497,6 +1497,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'title': video_title, 'ext': video_extension, 'format': video_format, + 'format_id': itag, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': player_url, From b5936c0059eae236cfc0b53fadf6bc24f8f8f3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 25 Oct 2013 17:17:23 +0200 Subject: [PATCH 15/81] Document the %(format_id)s field for the output template --- youtube_dl/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c141dcdda..a33dec785 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -336,7 +336,8 @@ def parseOpts(overrideArguments=None): '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' '%(autonumber)s to get an automatically incremented number, ' '%(ext)s for the 
filename extension, ' - '%(format)s for the format description (like "22 - 1280x720" or "HD")' + '%(format)s for the format description (like "22 - 1280x720" or "HD"),' + '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"),' '%(upload_date)s for the upload date (YYYYMMDD), ' '%(extractor)s for the provider (youtube, metacafe, etc), ' '%(id)s for the video id , %(playlist)s for the playlist the video is in, ' From 49a25557b082a147c875015ceeecb370671f025c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 25 Oct 2013 23:46:18 +0200 Subject: [PATCH 16/81] [8tracks] Use track count instead of looking at at_last_track property This fixes the error: $ youtube-dl http://8tracks.com/vladmc/counting-stars [8tracks] counting-stars: Downloading webpage [8tracks] counting-stars: Downloading song information 1/4 [8tracks] counting-stars: Downloading song information 2/4 [8tracks] counting-stars: Downloading song information 3/4 [8tracks] counting-stars: Downloading song information 4/4 [8tracks] counting-stars: Downloading song information 5/4 Traceback (most recent call last): File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main "__main__", fname, loader, pkg_name) File "/usr/lib/python2.7/runpy.py", line 72, in _run_code exec code in run_globals File "/home/phihag/projects/youtube-dl/youtube_dl/__main__.py", line 18, in youtube_dl.main() File "/home/phihag/projects/youtube-dl/youtube_dl/__init__.py", line 761, in main _real_main(argv) File "/home/phihag/projects/youtube-dl/youtube_dl/__init__.py", line 714, in _real_main retcode = ydl.download(all_urls) File "/home/phihag/projects/youtube-dl/youtube_dl/YoutubeDL.py", line 701, in download videos = self.extract_info(url) File "/home/phihag/projects/youtube-dl/youtube_dl/YoutubeDL.py", line 342, in extract_info ie_result = ie.extract(url) File "/home/phihag/projects/youtube-dl/youtube_dl/extractor/common.py", line 121, in extract return self._real_extract(url) File 
"/home/phihag/projects/youtube-dl/youtube_dl/extractor/eighttracks.py", line 111, in _real_extract 'id': track_data['id'], KeyError: 'id' --- youtube_dl/extractor/eighttracks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py index cced06811..2cfbcd363 100644 --- a/youtube_dl/extractor/eighttracks.py +++ b/youtube_dl/extractor/eighttracks.py @@ -101,7 +101,7 @@ class EightTracksIE(InfoExtractor): first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) next_url = first_url res = [] - for i in itertools.count(): + for i in range(track_count): api_json = self._download_webpage(next_url, playlist_id, note=u'Downloading song information %s/%s' % (str(i+1), track_count), errnote=u'Failed to download song information') @@ -116,7 +116,5 @@ class EightTracksIE(InfoExtractor): 'ext': 'm4a', } res.append(info) - if api_data['set']['at_last_track']: - break next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) return res From 5d0c97541af417064e5e3fb4eeb5416a436b0475 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 26 Oct 2013 20:38:54 +0200 Subject: [PATCH 17/81] [XHamsterIE] Extract SD and HD video --- youtube_dl/extractor/xhamster.py | 52 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 81c4be326..7444d3393 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -36,21 +36,25 @@ class XHamsterIE(InfoExtractor): }] def _real_extract(self,url): + def extract_video_url(webpage): + mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + if len(mobj.group('server')) == 0: + return 
compat_urllib_parse.unquote(mobj.group('file')) + else: + return mobj.group('server')+'/key='+mobj.group('file') + + def is_hd(webpage): + return webpage.find('
') != -1 + mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') seo = mobj.group('seo') - mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo) + mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - if len(mobj.group('server')) == 0: - video_url = compat_urllib_parse.unquote(mobj.group('file')) - else: - video_url = mobj.group('server')+'/key='+mobj.group('file') - video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') @@ -76,14 +80,32 @@ class XHamsterIE(InfoExtractor): age_limit = self._rta_search(webpage) - return [{ - 'id': video_id, - 'url': video_url, - 'ext': determine_ext(video_url), - 'title': video_title, + video_url = extract_video_url(webpage) + hd = is_hd(webpage) + formats = [{ + 'url': video_url, + 'ext': determine_ext(video_url), + 'format': 'hd' if hd else 'sd', + 'format_id': 'hd' if hd else 'sd', + }] + if not hd: + webpage = self._download_webpage(mrss_url+'?hd', video_id) + if is_hd(webpage): + video_url = extract_video_url(webpage) + formats.append({ + 'url': video_url, + 'ext': determine_ext(video_url), + 'format': 'hd', + 'format_id': 'hd', + }) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail, 'age_limit': age_limit, - }] + } From 7df286540f893f7fbba07da8ba3b09dd7c9027c4 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 26 Oct 2013 21:57:10 +0200 Subject: [PATCH 18/81] [YouPornIE] Extract all encrypted links and remove doubles at the end --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/extractor/youporn.py | 78 +++++++++++---------------------- 2 files changed, 27 insertions(+), 53 deletions(-) 
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e2332f9b8..d4654cc05 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -462,7 +462,7 @@ class YoutubeDL(object): info_dict['playlist_index'] = None # This extractors handle format selection themselves - if info_dict['extractor'] in [u'youtube', u'Youku', u'YouPorn', u'mixcloud']: + if info_dict['extractor'] in [u'youtube', u'Youku', u'mixcloud']: if download: self.process_info(info_dict) return info_dict diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e3b56cece..704ee89dc 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -31,20 +31,6 @@ class YouPornIE(InfoExtractor): } } - def _print_formats(self, formats): - """Print all available formats""" - print(u'Available formats:') - print(u'ext\t\tformat') - print(u'---------------------------------') - for format in formats: - print(u'%s\t\t%s' % (format['ext'], format['format'])) - - def _specific(self, req_format, formats): - for x in formats: - if x["format"] == req_format: - return x - return None - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -71,27 +57,22 @@ class YouPornIE(InfoExtractor): except KeyError: raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) - # Get all of the formats available + # Get all of the links from the page DOWNLOAD_LIST_RE = r'(?s)
    (?P.*?)
' download_list_html = self._search_regex(DOWNLOAD_LIST_RE, webpage, u'download list').strip() - - # Get all of the links from the page - LINK_RE = r'(?s)' + LINK_RE = r'' links = re.findall(LINK_RE, download_list_html) - - # Get link of hd video if available - mobj = re.search(r'var encryptedQuality720URL = \'(?P[a-zA-Z0-9+/]+={0,2})\';', webpage) - if mobj != None: - encrypted_video_url = mobj.group(u'encrypted_video_url') - video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8') - links = [video_url] + links + + # Get all encrypted links + encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage) + for encrypted_link in encrypted_links: + link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8') + links.append(link) if not links: raise ExtractorError(u'ERROR: no known formats available for video') - self.to_screen(u'Links found: %d' % len(links)) - formats = [] for link in links: @@ -103,39 +84,32 @@ class YouPornIE(InfoExtractor): path = compat_urllib_parse_urlparse( video_url ).path extension = os.path.splitext( path )[1][1:] format = path.split('/')[4].split('_')[:2] + # size = format[0] # bitrate = format[1] format = "-".join( format ) # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ - 'id': video_id, 'url': video_url, - 'uploader': video_uploader, - 'upload_date': upload_date, - 'title': video_title, 'ext': extension, 'format': format, - 'thumbnail': thumbnail, - 'description': video_description, - 'age_limit': age_limit, + 'format_id': format, }) - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - req_format = self._downloader.params.get('format', 'best') - self.to_screen(u'Format: %s' % req_format) - - if req_format is None or req_format == 'best': - return [formats[0]] - elif req_format == 'worst': - return [formats[-1]] - elif req_format in ('-1', 'all'): - return formats - else: - format = 
self._specific( req_format, formats ) - if format is None: - raise ExtractorError(u'Requested format not available') - return [format] + # Sort and remove doubles + formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + for i in range(len(formats)-1,0,-1): + if formats[i]['format_id'] == formats[i-1]['format_id']: + del formats[i] + + return { + 'id': video_id, + 'uploader': video_uploader, + 'upload_date': upload_date, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': video_description, + 'age_limit': age_limit, + 'formats': formats, + } From 1d45a23b745cdbb361dd5cef8f848f7ebcfa8f5a Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 26 Oct 2013 23:27:30 +0200 Subject: [PATCH 19/81] Add support for http://www.tube8.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tube8.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 youtube_dl/extractor/tube8.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index db69af361..84fc2e4fa 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,6 +121,7 @@ from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py new file mode 100644 index 000000000..b7e7d984d --- /dev/null +++ b/youtube_dl/extractor/tube8.py @@ -0,0 +1,63 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class Tube8IE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Ptube8.com/[^/]+/[^/]+/(?P[0-9]+)/?)' + _TEST = { + u'url': 
u'http://www.tube8.com/teen/kasia-music-video/229795/', + u'file': u'229795.mp4', + u'md5': u'e9e0b0c86734e5e3766e653509475db0', + u'info_dict': { + u"description": u"hot teen Kasia grinding", + u"uploader": u"unknown", + u"title": u"Kasia music video", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'videotitle ="([^"]+)', webpage, u'title') + video_description = self._html_search_regex(r'>Description:(.+?)<', webpage, u'description', fatal=False) + video_uploader = self._html_search_regex(r'>Submitted by:(?:\w|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False) + if thumbnail: + thumbnail = thumbnail.replace('\\/', '/') + + video_url = self._html_search_regex(r'"video_url":"([^"]+)', webpage, u'video_url') + if webpage.find('"encrypted":true')!=-1: + password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password') + video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': video_description, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + } From 6e76104d66624a8f742d1e0d210a35452a79aec8 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Sat, 26 Oct 2013 23:33:32 +0200 Subject: [PATCH 20/81] [YouPornIE] Make webpage download more robust --- youtube_dl/extractor/youporn.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 704ee89dc..e46a9b4d6 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -17,7 +17,7 @@ from ..aes import ( ) class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', u'file': u'505835.mp4', @@ -34,6 +34,7 @@ class YouPornIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') From 14e10b2b6ec0d1ac3af36cc0458673ec89a88f03 Mon Sep 17 00:00:00 2001 From: pyed <iAbdulElah@Gmail.com> Date: Sun, 27 Oct 2013 01:19:38 +0300 Subject: [PATCH 21/81] [addanime] try to download HQ before normal --- youtube_dl/extractor/addanime.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 82a785a19..adbda194a 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -17,8 +17,8 @@ class AddAnimeIE(InfoExtractor): IE_NAME = u'AddAnime' _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - u'file': u'24MR3YO5SAS9.flv', - u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1', + u'file': u'24MR3YO5SAS9.mp4', + u'md5': u'3f8e232ad52163c87fa23897e736cb2c', u'info_dict': { u"description": u"One Piece 606", u"title": u"One Piece 606" @@ -60,8 +60,12 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + video_url = self._search_regex(r"var 
hq_video_file = '(.*?)';", webpage, u'video file URL') + if not video_url: # if there's no hq_video_file, get normal_video_file + video_url = self._search_regex(r"var normal_video_file = '(.*?)';", + webpage, u'video file URL') + video_extension = video_url[-3:] # mp4 or flv ? video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) @@ -69,7 +73,7 @@ class AddAnimeIE(InfoExtractor): '_type': 'video', 'id': video_id, 'url': video_url, - 'ext': 'flv', + 'ext': video_extension, 'title': video_title, 'description': video_description } From 8cb57d9b91cce72b522d89b5e3e469c433956a07 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeo@users.noreply.github.com> Date: Sun, 27 Oct 2013 00:21:27 +0200 Subject: [PATCH 22/81] [Tube8IE] Escape dot in regex --- youtube_dl/extractor/tube8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index b7e7d984d..ef8d21642 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -13,7 +13,7 @@ from ..aes import ( ) class Tube8IE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)' _TEST = { u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/', u'file': u'229795.mp4', From 125cfd78e8579b1c6104d3ec2359417677863a8a Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 27 Oct 2013 01:04:22 +0200 Subject: [PATCH 23/81] Add support for http://www.pornhub.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pornhub.py | 67 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/pornhub.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index db69af361..2a5518665 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -94,6 +94,7 @@ from .ooyala import OoyalaIE from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .pornhub import PornHubIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py new file mode 100644 index 000000000..3dbd2ab69 --- /dev/null +++ b/youtube_dl/extractor/pornhub.py @@ -0,0 +1,67 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class PornHubIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _TEST = { + u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015', + u'file': u'648719015.mp4', + u'md5': u'882f488fa1f0026f023f33576004a2ed', + u'info_dict': { + u"uploader": u"BABES-COM", + u"title": u"Seductive Indian beauty strips down and fingers her pink pussy", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' 
+ mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, u'title') + video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False) + if thumbnail: + thumbnail = compat_urllib_parse.unquote(thumbnail) + + video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + if webpage.find('"encrypted":true') != -1: + password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password').replace('+', ' ') + video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + + formats = [] + for video_url in video_urls: + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[5].split('_')[:2] + format = "-".join( format ) + formats.append({ + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + }) + formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'formats': formats, + } From 71865091abbb0166edeffff14da019542260557f Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeo@users.noreply.github.com> Date: Sun, 27 Oct 2013 01:08:03 +0200 Subject: [PATCH 24/81] [Tube8IE] Fix regex for uploader extraction --- youtube_dl/extractor/tube8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index ef8d21642..ebc8c1f4f 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -36,7 +36,7 @@ class Tube8IE(InfoExtractor): 
video_title = self._html_search_regex(r'videotitle ="([^"]+)', webpage, u'title') video_description = self._html_search_regex(r'>Description:</strong>(.+?)<', webpage, u'description', fatal=False) - video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\w|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) + video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False) thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False) if thumbnail: thumbnail = thumbnail.replace('\\/', '/') From 7b2212e954a3f2ecf1c0936d7c5b90a43fa380cd Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 27 Oct 2013 01:59:26 +0200 Subject: [PATCH 25/81] Add support for http://www.spankwire.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/spankwire.py | 70 +++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 youtube_dl/extractor/spankwire.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index db69af361..7a60e0937 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -109,6 +109,7 @@ from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import SouthParkStudiosIE +from .spankwire import SpankwireIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py new file mode 100644 index 000000000..f0d5009c7 --- /dev/null +++ b/youtube_dl/extractor/spankwire.py @@ -0,0 +1,70 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + 
+class SpankwireIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)' + _TEST = { + u'url': u'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + u'file': u'103545.mp4', + u'md5': u'1b3f55e345500552dbc252a3e9c1af43', + u'info_dict': { + u"uploader": u"oreusz", + u"title": u"Buckcherry`s X Rated Music Video Crazy Bitch", + u"description": u"Crazy Bitch X rated music video.", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title') + video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False) + thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False) + description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False) + if len(description) == 0: + description = None + + video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage))) + if webpage.find('flashvars\.encrypted = "true"') != -1: + password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, u'password').replace('+', ' ') + video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) + + formats = [] + for video_url in video_urls: + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + formats.append({ + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + }) + formats.sort(key=lambda format: 
list(map(lambda s: s.zfill(6), format['format'].split('-')))) + + return { + 'id': video_id, + 'uploader': video_uploader, + 'title': video_title, + 'thumbnail': thumbnail, + 'description': description, + 'formats': formats, + } From 5b11143d05c6d38cf1df94561c2a515c9150b2e1 Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 27 Oct 2013 10:10:28 +0100 Subject: [PATCH 26/81] Add support for http://www.keezmovies.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/keezmovies.py | 58 ++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 youtube_dl/extractor/keezmovies.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index db69af361..d4ad4e37c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .jeuxvideo import JeuxVideoIE from .jukebox import JukeboxIE from .justintv import JustinTVIE from .kankan import KankanIE +from .keezmovies import KeezMoviesIE from .kickstarter import KickStarterIE from .keek import KeekIE from .liveleak import LiveLeakIE diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py new file mode 100644 index 000000000..937caf664 --- /dev/null +++ b/youtube_dl/extractor/keezmovies.py @@ -0,0 +1,58 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urllib_parse, + unescapeHTML, +) +from ..aes import ( + aes_decrypt_text +) + +class KeezMoviesIE(InfoExtractor): + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))' + _TEST = { + u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', + u'file': u'1214711.mp4', + u'md5': u'6e297b7e789329923fcf83abb67c9289', + u'info_dict': { + u"title": u"Petite Asian Lady Mai Playing In Bathtub", + } + } + + def _real_extract(self, url): + mobj = 
re.match(self._VALID_URL, url) + video_id = mobj.group('videoid') + url = 'http://www.' + mobj.group('url') + + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'age_verified=1') + webpage = self._download_webpage(req, video_id) + + # embedded video + mobj = re.search(r'href="([^"]+)"></iframe>', webpage) + if mobj: + embedded_url = mobj.group(1) + return self.playlist_result([self.url_result(embedded_url)], playlist_id=video_id) + + video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') + video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + if webpage.find('encrypted=true')!=-1: + password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password') + video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') + path = compat_urllib_parse_urlparse( video_url ).path + extension = os.path.splitext( path )[1][1:] + format = path.split('/')[4].split('_')[:2] + format = "-".join( format ) + + return { + 'id': video_id, + 'title': video_title, + 'url': video_url, + 'ext': extension, + 'format': format, + 'format_id': format, + } From aee5e18c8f4c4360216ab27a2b1362a2ce24881e Mon Sep 17 00:00:00 2001 From: Abdulelah Alfntokh <iAbdulelah@Gmail.com> Date: Sun, 27 Oct 2013 13:36:43 +0300 Subject: [PATCH 27/81] [addanime] catch 'RegexNotFoundError' --- youtube_dl/extractor/addanime.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index adbda194a..45aac15c3 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -8,6 +8,7 @@ from ..utils import ( compat_urllib_parse_urlparse, ExtractorError, + RegexNotFoundError, ) @@ -60,11 +61,13 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r"var hq_video_file = '(.*?)';", - webpage, u'video 
file URL') - if not video_url: # if there's no hq_video_file, get normal_video_file + try: + video_url = self._search_regex(r"var hq_video_file = '(.*?)';", + webpage, u'video file URL') + except RegexNotFoundError: video_url = self._search_regex(r"var normal_video_file = '(.*?)';", webpage, u'video file URL') + video_extension = video_url[-3:] # mp4 or flv ? video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) From 3e6a330d38e2bfce12a789c0f51c7d9754f4316e Mon Sep 17 00:00:00 2001 From: Abdulelah Alfntokh <iAbdulelah@Gmail.com> Date: Sun, 27 Oct 2013 13:51:26 +0300 Subject: [PATCH 28/81] [addanime] fix md5sum --- youtube_dl/extractor/addanime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 45aac15c3..490b5af62 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -19,7 +19,7 @@ class AddAnimeIE(InfoExtractor): _TEST = { u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', u'file': u'24MR3YO5SAS9.mp4', - u'md5': u'3f8e232ad52163c87fa23897e736cb2c', + u'md5': u'72954ea10bc979ab5e2eb288b21425a0', u'info_dict': { u"description": u"One Piece 606", u"title": u"One Piece 606" From 67874aeffa37a114b01fe6be11d156b7ece584b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 12:07:58 +0100 Subject: [PATCH 29/81] [facebook] Fix the login process (fixes #1244) --- youtube_dl/extractor/facebook.py | 63 ++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 9d1bc0751..62881da31 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -19,7 +19,8 @@ class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" _VALID_URL = 
r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' - _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' + _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' + _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' IE_NAME = u'facebook' _TEST = { @@ -36,50 +37,56 @@ class FacebookIE(InfoExtractor): """Report attempt to log in.""" self.to_screen(u'Logging in') - def _real_initialize(self): - if self._downloader is None: - return - - useremail = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - useremail = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - useremail = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) - return - + def _login(self): + (useremail, password) = self._get_login_info() if useremail is None: return - # Log in + login_page_req = compat_urllib_request.Request(self._LOGIN_URL) + login_page_req.add_header('Cookie', 'locale=en_US') + self.report_login() + login_page = self._download_webpage(login_page_req, None, note=False, + errnote=u'Unable to download login page') + lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd') + lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd') + login_form = { 'email': useremail, 'pass': password, - 'login': 'Log+In' + 'lsd': lsd, + 'lgnrnd': 
lgnrnd, + 'next': 'http://facebook.com/home.php', + 'default_persistent': '0', + 'legacy_return': '1', + 'timezone': '-60', + 'trynum': '1', } request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: - self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None: self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return + + check_form = { + 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'), + 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'), + 'name_action_selected': 'dont_save', + 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'), + } + check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form)) + check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_response = compat_urllib_request.urlopen(check_req).read() + if re.search(r'id="checkpointSubmitButton"', check_response) is not None: + self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return + def _real_initialize(self): + self._login() + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: From 6f71ef580c0d93947817c81a09f6a188631585a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 12:09:46 +0100 Subject: [PATCH 30/81] [facebook] Report a more 
meaningful message if the video cannot be accessed (closes #1658) --- youtube_dl/extractor/facebook.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 62881da31..aa2525f17 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -100,7 +100,12 @@ class FacebookIE(InfoExtractor): AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) if not m: - raise ExtractorError(u'Cannot parse data') + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) + if m_msg is not None: + err_msg = u'The video is not available, Facebook said: "%s"' % m_msg.group(1) + else: + err_msg = u'Cannot parse data' + raise ExtractorError(err_msg) data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) From 749a4fd2fd88017bafca5c298f16123fd0146b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 12:13:55 +0100 Subject: [PATCH 31/81] [facebook] Don't recommend to report the issue if the video is private. 
--- youtube_dl/extractor/facebook.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index aa2525f17..f8bdfc2d3 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -102,10 +102,11 @@ class FacebookIE(InfoExtractor): if not m: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) if m_msg is not None: - err_msg = u'The video is not available, Facebook said: "%s"' % m_msg.group(1) + raise ExtractorError( + u'The video is not available, Facebook said: "%s"' % m_msg.group(1), + expected=True) else: - err_msg = u'Cannot parse data' - raise ExtractorError(err_msg) + raise ExtractorError(u'Cannot parse data') data = dict(json.loads(m.group(1))) params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) From 5da054958151263040f2a53cf554b0084e79f6fa Mon Sep 17 00:00:00 2001 From: rzhxeo <rzhxeot7z81b4700@mailcatch.com> Date: Sun, 27 Oct 2013 12:48:09 +0100 Subject: [PATCH 32/81] [KeezMoviesIE] Correct return value for embedded videos --- youtube_dl/extractor/keezmovies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 937caf664..23d5209d9 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -36,7 +36,7 @@ class KeezMoviesIE(InfoExtractor): mobj = re.search(r'href="([^"]+)"></iframe>', webpage) if mobj: embedded_url = mobj.group(1) - return self.playlist_result([self.url_result(embedded_url)], playlist_id=video_id) + return self.url_result(embedded_url) video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title') video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) From af4d506eb35a257e91098fa92498b24ef5de14c6 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 14:18:55 +0100 Subject: [PATCH 33/81] [faz] Use a regex for getting the description The page cannot be parsed in python2.6 with the html parser. --- youtube_dl/extractor/faz.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index deaa4ed2d..89ed08db4 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -5,8 +5,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( determine_ext, - clean_html, - get_element_by_attribute, ) @@ -47,12 +45,12 @@ class FazIE(InfoExtractor): 'format_id': code.lower(), }) - descr_html = get_element_by_attribute('class', 'Content Copy', webpage) + descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description') info = { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, - 'description': clean_html(descr_html), + 'description': descr, 'thumbnail': config.find('STILL/STILL_BIG').text, } # TODO: Remove when #980 has been merged From aa929c37d58163cc13184b6922ebc9ceb4625239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 14:21:37 +0100 Subject: [PATCH 34/81] [generic] Fix test video's checksum --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 69e0a7bd2..ab4a5b7de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -25,7 +25,7 @@ class GenericIE(InfoExtractor): { u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', u'file': u'13601338388002.mp4', - u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', + u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd', u'info_dict': { u"uploader": 
u"www.hodiho.fr", u"title": u"R\u00e9gis plante sa Jeep" From bc63d9d3294072e2b355c3363c0fb5c33756d3af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 14:26:19 +0100 Subject: [PATCH 35/81] [rtlnow] Change the test for rtlnitronow --- youtube_dl/extractor/rtlnow.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index d1b08c9bc..9ac7c3be8 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -63,13 +63,12 @@ class RTLnowIE(InfoExtractor): }, }, { - u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1', - u'file': u'127367.flv', + u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1', + u'file': u'129679.flv', u'info_dict': { - u'upload_date': u'20130926', - u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...', - u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin', - u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg', + u'upload_date': u'20131016', + u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...', + u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig', }, u'params': { u'skip_download': True, From c19f7764a5499b0f1e1914dd5101619b8d57d7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 27 Oct 2013 14:40:25 +0100 Subject: [PATCH 36/81] [generic] Detect bandcamp pages that use custom domains (closes #1662) They embed the original url in the 'og:url' property. 
--- youtube_dl/extractor/generic.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ab4a5b7de..2c8fcf5ae 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -41,7 +41,17 @@ class GenericIE(InfoExtractor): u"uploader_id": u"skillsmatter", u"uploader": u"Skills Matter", } - } + }, + # bandcamp page with custom domain + { + u'url': u'http://bronyrock.com/track/the-pony-mash', + u'file': u'3235767654.mp3', + u'info_dict': { + u'title': u'The Pony Mash', + u'uploader': u'M_Pallante', + }, + u'skip': u'There is a limit of 200 free downloads / month for the test song', + }, ] def report_download_webpage(self, video_id): @@ -155,6 +165,12 @@ class GenericIE(InfoExtractor): surl = unescapeHTML(mobj.group(1)) return self.url_result(surl, 'Youtube') + # Look for Bandcamp pages with custom domain + mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) + if mobj is not None: + burl = unescapeHTML(mobj.group(1)) + return self.url_result(burl, 'Bandcamp') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: From 198e370f23d9e97b335d1c2603b9fc624817b701 Mon Sep 17 00:00:00 2001 From: Abdulelah Alfntokh <iAbdulelah@Gmail.com> Date: Sun, 27 Oct 2013 19:48:02 +0300 Subject: [PATCH 37/81] [addanime] better regex. 
--- youtube_dl/extractor/addanime.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 490b5af62..465df8cf0 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -8,7 +8,6 @@ from ..utils import ( compat_urllib_parse_urlparse, ExtractorError, - RegexNotFoundError, ) @@ -61,12 +60,8 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - try: - video_url = self._search_regex(r"var hq_video_file = '(.*?)';", - webpage, u'video file URL') - except RegexNotFoundError: - video_url = self._search_regex(r"var normal_video_file = '(.*?)';", - webpage, u'video file URL') + video_url = self._search_regex(r"var (?:hq|normal)_video_file = '(.*?)';", + webpage, u'video file URL') video_extension = video_url[-3:] # mp4 or flv ? video_title = self._og_search_title(webpage) From 7d8c2e07f218dc33aefb77db78fa420becb53732 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Mon, 28 Oct 2013 00:33:43 -0400 Subject: [PATCH 38/81] [Exfm] replace the failing Soundcloud test vector (broken also in browser) --- youtube_dl/extractor/exfm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 3443f19c5..c74556579 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -11,14 +11,14 @@ class ExfmIE(InfoExtractor): _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' _TESTS = [ { - u'url': u'http://ex.fm/song/1bgtzg', - u'file': u'95223130.mp3', - u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', + u'url': u'http://ex.fm/song/eh359', + u'file': u'44216187.mp3', + u'md5': u'e45513df5631e6d760970b14cc0c11e7', u'info_dict': { - u"title": u"We Can't Stop - Miley Cyrus", - u"uploader": u"Miley Cyrus", - u'upload_date': u'20130603', 
- u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', + u"title": u"Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive", + u"uploader": u"deadjournalist", + u'upload_date': u'20120424', + u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', }, u'note': u'Soundcloud song', }, From 750e9833b83c6e17a4efa8d5dac5b3cd848f4603 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Mon, 28 Oct 2013 01:50:17 -0400 Subject: [PATCH 39/81] Add the missing age_limit tags; added a devscript to do a superficial check for porn sites without the age_limit tag in the test --- devscripts/check-porn.py | 39 ++++++++++++++++++++++++++++++ youtube_dl/extractor/keezmovies.py | 5 +++- youtube_dl/extractor/pornhub.py | 2 ++ youtube_dl/extractor/pornotube.py | 3 ++- youtube_dl/extractor/spankwire.py | 4 +++ youtube_dl/extractor/tube8.py | 2 ++ youtube_dl/extractor/youjizz.py | 8 ++++-- 7 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 devscripts/check-porn.py diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py new file mode 100644 index 000000000..63401fe18 --- /dev/null +++ b/devscripts/check-porn.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +""" +This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check +if we are not 'age_limit' tagging some porn site +""" + +# Allow direct execution +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import get_testcases +from youtube_dl.utils import compat_urllib_request + +for test in get_testcases(): + try: + webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() + except: + print('\nFail: {0}'.format(test['name'])) + continue + + webpage = webpage.decode('utf8', 'replace') + + if 'porn' in webpage.lower() and ('info_dict' 
not in test + or 'age_limit' not in test['info_dict'] + or test['info_dict']['age_limit'] != 18): + print('\nPotential missing age_limit check: {0}'.format(test['name'])) + + elif 'porn' not in webpage.lower() and ('info_dict' in test and + 'age_limit' in test['info_dict'] and + test['info_dict']['age_limit'] == 18): + print('\nPotential false negative: {0}'.format(test['name'])) + + else: + sys.stdout.write('.') + sys.stdout.flush() + +print() diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 23d5209d9..5e05900da 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -6,7 +6,6 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, - unescapeHTML, ) from ..aes import ( aes_decrypt_text @@ -20,6 +19,7 @@ class KeezMoviesIE(InfoExtractor): u'md5': u'6e297b7e789329923fcf83abb67c9289', u'info_dict': { u"title": u"Petite Asian Lady Mai Playing In Bathtub", + u"age_limit": 18, } } @@ -48,6 +48,8 @@ class KeezMoviesIE(InfoExtractor): format = path.split('/')[4].split('_')[:2] format = "-".join( format ) + age_limit = self._rta_search(webpage) + return { 'id': video_id, 'title': video_title, @@ -55,4 +57,5 @@ class KeezMoviesIE(InfoExtractor): 'ext': extension, 'format': format, 'format_id': format, + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3dbd2ab69..5e2454f1b 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -21,6 +21,7 @@ class PornHubIE(InfoExtractor): u'info_dict': { u"uploader": u"BABES-COM", u"title": u"Seductive Indian beauty strips down and fingers her pink pussy", + u"age_limit": 18 } } @@ -64,4 +65,5 @@ class PornHubIE(InfoExtractor): 'title': video_title, 'thumbnail': thumbnail, 'formats': formats, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5d770ec28..35dc5a9ff 100644 
--- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -16,7 +16,8 @@ class PornotubeIE(InfoExtractor): u'md5': u'374dd6dcedd24234453b295209aa69b6', u'info_dict': { u"upload_date": u"20090708", - u"title": u"Marilyn-Monroe-Bathing" + u"title": u"Marilyn-Monroe-Bathing", + u"age_limit": 18 } } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index f0d5009c7..32df0a7fb 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -22,6 +22,7 @@ class SpankwireIE(InfoExtractor): u"uploader": u"oreusz", u"title": u"Buckcherry`s X Rated Music Video Crazy Bitch", u"description": u"Crazy Bitch X rated music video.", + u"age_limit": 18, } } @@ -60,6 +61,8 @@ class SpankwireIE(InfoExtractor): }) formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) + age_limit = self._rta_search(webpage) + return { 'id': video_id, 'uploader': video_uploader, @@ -67,4 +70,5 @@ class SpankwireIE(InfoExtractor): 'thumbnail': thumbnail, 'description': description, 'formats': formats, + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index ebc8c1f4f..aea9d9a24 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -22,6 +22,7 @@ class Tube8IE(InfoExtractor): u"description": u"hot teen Kasia grinding", u"uploader": u"unknown", u"title": u"Kasia music video", + u"age_limit": 18, } } @@ -60,4 +61,5 @@ class Tube8IE(InfoExtractor): 'ext': extension, 'format': format, 'format_id': format, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 1265639e8..1fcc518ac 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -13,7 +13,8 @@ class YouJizzIE(InfoExtractor): u'file': u'2189178.flv', u'md5': u'07e15fa469ba384c7693fd246905547c', u'info_dict': { - u"title": u"Zeichentrick 1" + u"title": u"Zeichentrick 1", + 
u"age_limit": 18, } } @@ -25,6 +26,8 @@ class YouJizzIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) + age_limit = self._rta_search(webpage) + # Get the video title video_title = self._html_search_regex(r'<title>(?P<title>.*)', webpage, u'title').strip() @@ -60,6 +63,7 @@ class YouJizzIE(InfoExtractor): 'title': video_title, 'ext': 'flv', 'format': 'flv', - 'player_url': embed_page_url} + 'player_url': embed_page_url, + 'age_limit': age_limit} return [info] From 8ffa13e03e995f2009d8240cbdc6ba7aba9d3759 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 28 Oct 2013 02:34:29 -0400 Subject: [PATCH 40/81] [Instagram] get the non-https link, as they are serving Akamai cert from a instagram.com domain --- youtube_dl/extractor/common.py | 8 ++++---- youtube_dl/extractor/instagram.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index aaa5c24c8..8b067b48d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -318,10 +318,10 @@ class InfoExtractor(object): def _og_search_title(self, html, **kargs): return self._og_search_property('title', html, **kargs) - def _og_search_video_url(self, html, name='video url', **kargs): - return self._html_search_regex([self._og_regex('video:secure_url'), - self._og_regex('video')], - html, name, **kargs) + def _og_search_video_url(self, html, name='video url', secure=True, **kargs): + regexes = [self._og_regex('video')] + if secure: regexes.insert(0, self._og_regex('video:secure_url')) + return self._html_search_regex(regexes, html, name, **kargs) def _rta_search(self, html): # See http://www.rtalabel.org/index.php?content=howtofaq#single diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ddc42882a..213aac428 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -26,7 +26,7 @@ class 
InstagramIE(InfoExtractor): return [{ 'id': video_id, - 'url': self._og_search_video_url(webpage), + 'url': self._og_search_video_url(webpage, secure=False), 'ext': 'mp4', 'title': u'Video by %s' % uploader_id, 'thumbnail': self._og_search_thumbnail(webpage), From d41e6efc852c34da582790a54ecc4f5e9dbbedda Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 10:44:02 +0100 Subject: [PATCH 41/81] New debug option --write-pages --- youtube_dl/__init__.py | 4 ++++ youtube_dl/extractor/common.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a33dec785..48ffcbf8e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -316,6 +316,9 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--write-pages', + action='store_true', dest='write_pages', default=False, + help='Write downloaded pages to files in the current directory') verbosity.add_option('--youtube-print-sig-code', action='store_true', dest='youtube_print_sig_code', default=False, help=optparse.SUPPRESS_HELP) @@ -652,6 +655,7 @@ def _real_main(argv=None): 'prefer_free_formats': opts.prefer_free_formats, 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, + 'write_pages': opts.write_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index aaa5c24c8..458635f1e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,6 +15,7 @@ from ..utils import ( compiled_regex_type, ExtractorError, RegexNotFoundError, + sanitize_filename, unescapeHTML, ) @@ -182,6 +183,17 @@ class InfoExtractor(object): self.to_screen(u'Dumping request to ' + url) dump = 
base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) + if self._downloader.params.get('write_pages', False): + try: + url = url_or_request.get_full_url() + except AttributeError: + url = url_or_request + raw_filename = ('%s_%s.dump' % (video_id, url)) + filename = sanitize_filename(raw_filename, restricted=True) + self.to_screen(u'Saving request to ' + filename) + with open(filename, 'wb') as outf: + outf.write(webpage_bytes) + content = webpage_bytes.decode(encoding, 'replace') return (content, urlh) From 77d0a82fefd8ad7a2ab0662739aa2f039bed11ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:24:47 +0100 Subject: [PATCH 42/81] [addanime] Use new formats system --- youtube_dl/extractor/addanime.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 465df8cf0..3b8258ad8 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -31,7 +31,8 @@ class AddAnimeIE(InfoExtractor): video_id = mobj.group('video_id') webpage = self._download_webpage(url, video_id) except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError): + if not isinstance(ee.cause, compat_HTTPError) or \ + ee.cause.code != 503: raise redir_webpage = ee.cause.read().decode('utf-8') @@ -60,18 +61,27 @@ class AddAnimeIE(InfoExtractor): note=u'Confirming after redirect') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r"var (?:hq|normal)_video_file = '(.*?)';", - webpage, u'video file URL') - - video_extension = video_url[-3:] # mp4 or flv ? 
+ formats = [] + for format_id in ('normal', 'hq'): + rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) + video_url = self._search_regex(rex, webpage, u'video file URLx', + fatal=False) + if not video_url: + continue + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': video_url[-3:], + }) + if not formats: + raise ExtractorError(u'Cannot find any video format!') video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) return { '_type': 'video', 'id': video_id, - 'url': video_url, - 'ext': video_extension, + 'formats': formats, 'title': video_title, 'description': video_description } From c1002e96e98f4851aed5de0142e8e2bd1ac4661c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:28:02 +0100 Subject: [PATCH 43/81] Let extractors omit ext in formats --- youtube_dl/YoutubeDL.py | 3 +++ youtube_dl/extractor/addanime.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d4654cc05..b09eeff32 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -484,6 +484,9 @@ class YoutubeDL(object): res=self.format_resolution(format), note=u' ({})'.format(format['format_note']) if format.get('format_note') is not None else '', ) + # Automatically determine file extension if missing + if 'ext' not in format: + format['ext'] = determine_ext(format['url']) if self.params.get('listformats', None): self.list_formats(info_dict) diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 3b8258ad8..b99d4b966 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -71,7 +71,6 @@ class AddAnimeIE(InfoExtractor): formats.append({ 'format_id': format_id, 'url': video_url, - 'ext': video_url[-3:], }) if not formats: raise ExtractorError(u'Cannot find any video format!') From 8abeeb94490e7066826ac086554be935a0c1dd94 Mon Sep 17 00:00:00 2001 From: 
Philipp Hagemeister Date: Mon, 28 Oct 2013 11:31:12 +0100 Subject: [PATCH 44/81] Nicer --list-formats output --- youtube_dl/YoutubeDL.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b09eeff32..12621ff95 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -757,23 +757,23 @@ class YoutubeDL(object): archive_file.write(vid_id + u'\n') @staticmethod - def format_resolution(format): + def format_resolution(format, default='unknown'): if format.get('height') is not None: if format.get('width') is not None: res = u'%sx%s' % (format['width'], format['height']) else: res = u'%sp' % format['height'] else: - res = '???' + res = default return res def list_formats(self, info_dict): formats_s = [] for format in info_dict.get('formats', [info_dict]): - formats_s.append(u'%-15s: %-5s %-15s[%s]' % ( + formats_s.append(u'%-15s%-7s %-15s%s' % ( format['format_id'], format['ext'], - format.get('format_note') or '-', + format.get('format_note', ''), self.format_resolution(format), ) ) From 1003d108d51b7eb5edb84778ec234b217d72d4a5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:32:22 +0100 Subject: [PATCH 45/81] [vimeo] Support hash in URL (Fixes #1669) --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index ef90fecc0..b4dbcd2ee 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,12 +20,12 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?$' + _VALID_URL = 
r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)/?(?:[?].*)?(?:#.*)?$' _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TESTS = [ { - u'url': u'http://vimeo.com/56015672', + u'url': u'http://vimeo.com/56015672#at=0', u'file': u'56015672.mp4', u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', u'info_dict': { From f088ea54863f17cad7d50d73b49042e18092de3e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:34:21 +0100 Subject: [PATCH 46/81] release 2013.10.28 --- README.md | 23 +++++++++++++---------- youtube_dl/version.py | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2b8db0cfc..a2b296613 100644 --- a/README.md +++ b/README.md @@ -79,16 +79,17 @@ which means you can modify it, redistribute it or use it however you like. different, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(format)s for the format description - (like "22 - 1280x720" or "HD")%(upload_date)s for - the upload date (YYYYMMDD), %(extractor)s for the - provider (youtube, metacafe, etc), %(id)s for the - video id , %(playlist)s for the playlist the - video is in, %(playlist_index)s for the position - in the playlist and %% for a literal percent. Use - - to output to stdout. Can also be used to - download to a different directory, for example - with -o '/my/downloads/%(uploader)s/%(title)s-%(i - d)s.%(ext)s' . + (like "22 - 1280x720" or "HD"),%(format_id)s for + the unique id of the format (like Youtube's + itags: "137"),%(upload_date)s for the upload date + (YYYYMMDD), %(extractor)s for the provider + (youtube, metacafe, etc), %(id)s for the video id + , %(playlist)s for the playlist the video is in, + %(playlist_index)s for the position in the + playlist and %% for a literal percent. Use - to + output to stdout. 
Can also be used to download to + a different directory, for example with -o '/my/d + ownloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . --autonumber-size NUMBER Specifies the number of digits in %(autonumber)s when it is present in output filename template or --autonumber option is given @@ -126,6 +127,8 @@ which means you can modify it, redistribute it or use it however you like. -v, --verbose print various debugging information --dump-intermediate-pages print downloaded pages to debug problems(very verbose) + --write-pages Write downloaded pages to files in the current + directory ## Video Format Options: -f, --format FORMAT video format code, specifiy the order of diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b4ce6068f..048afc8e7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.23.2' +__version__ = '2013.10.28' From a7685f3bf4275bfc0f390146e4ac99139d5b96b9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:41:32 +0100 Subject: [PATCH 47/81] mixcloud does not do any format selection --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 12621ff95..2a779373a 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -462,7 +462,7 @@ class YoutubeDL(object): info_dict['playlist_index'] = None # This extractors handle format selection themselves - if info_dict['extractor'] in [u'youtube', u'Youku', u'mixcloud']: + if info_dict['extractor'] in [u'youtube', u'Youku']: if download: self.process_info(info_dict) return info_dict From 78a3a9f89ef4a9918c0e6dc854b99df9c2a94e4e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 11:41:43 +0100 Subject: [PATCH 48/81] Make "requested format not available" expected (#1655) --- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py 
b/youtube_dl/YoutubeDL.py index 2a779373a..19dabef2d 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -524,7 +524,8 @@ class YoutubeDL(object): formats_to_download = [selected_format] break if not formats_to_download: - raise ExtractorError(u'requested format not available') + raise ExtractorError(u'requested format not available', + expected=True) if download: if len(formats_to_download) > 1: From 216d71d001989725b402a7ebee4715541314fd61 Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 16:28:35 +0100 Subject: [PATCH 49/81] Check if description and thumbnail are None to prevent crash --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19dabef2d..313295839 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -575,9 +575,9 @@ class YoutubeDL(object): if self.params.get('forceurl', False): # For RTMP URLs, also include the playpath compat_print(info_dict['url'] + info_dict.get('play_path', u'')) - if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: compat_print(info_dict['thumbnail']) - if self.params.get('forcedescription', False) and 'description' in info_dict: + if self.params.get('forcedescription', False) and info_dict.get('description') is not None: compat_print(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: compat_print(filename) From 369a759acc9d12590355c6d9f96ef7852153570f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 16:54:38 +0100 Subject: [PATCH 50/81] setup.py: Make sure the setuptools_available variable is set Otherwise it would crash if it can't import setuptools. 
--- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f14f96377..aa7cfca08 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ try: setuptools_available = True except ImportError: from distutils.core import setup + setuptools_available = False try: # This will create an exe that needs Microsoft Visual C++ 2008 From 2bc67c35acece68a75284b88fcb03d69f267a63c Mon Sep 17 00:00:00 2001 From: rzhxeo Date: Mon, 28 Oct 2013 18:22:55 +0100 Subject: [PATCH 51/81] [KeezMoviesIE] Detect URLs with numbers in the SEO part correct --- youtube_dl/extractor/keezmovies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 5e05900da..786924445 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -12,7 +12,7 @@ from ..aes import ( ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pkeezmovies\.com/video/.+?(?P[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?Pkeezmovies\.com/video/.+?(?P[0-9]+))(?:[/?&]|$)' _TEST = { u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', u'file': u'1214711.mp4', From 702665c0854af6fb317600c4825c0b00e2a4c981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 22:01:37 +0100 Subject: [PATCH 52/81] tests: build the filename from the info_dict if the 'file' key is missing It will need to have the 'id' and 'ext' keys to work. 
--- test/test_download.py | 39 +++++++++++++++++++++++---------------- youtube_dl/YoutubeDL.py | 2 +- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index b9a9be11d..f136176b1 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -60,9 +60,12 @@ def generator(test_case): if not ie._WORKING: print_skipping('IE marked as not _WORKING') return - if 'playlist' not in test_case and not test_case['file']: - print_skipping('No output file specified') - return + if 'playlist' not in test_case: + info_dict = test_case.get('info_dict', {}) + if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')): + print_skipping('The output file cannot be know, the "file" ' + 'key is missing or the info_dict is incomplete') + return if 'skip' in test_case: print_skipping(test_case['skip']) return @@ -77,11 +80,17 @@ def generator(test_case): finished_hook_called.add(status['filename']) ydl.fd.add_progress_hook(_hook) + def get_tc_filename(tc): + return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) + test_cases = test_case.get('playlist', [test_case]) - for tc in test_cases: - try_rm(tc['file']) - try_rm(tc['file'] + '.part') - try_rm(tc['file'] + '.info.json') + def try_rm_tcs_files(): + for tc in test_cases: + tc_filename = get_tc_filename(tc) + try_rm(tc_filename) + try_rm(tc_filename + '.part') + try_rm(tc_filename + '.info.json') + try_rm_tcs_files() try: for retry in range(1, RETRIES + 1): try: @@ -98,14 +107,15 @@ def generator(test_case): break for tc in test_cases: + tc_filename = get_tc_filename(tc) if not test_case.get('params', {}).get('skip_download', False): - self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file']) - self.assertTrue(tc['file'] in finished_hook_called) - self.assertTrue(os.path.exists(tc['file'] + '.info.json')) + self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) + 
self.assertTrue(tc_filename in finished_hook_called) + self.assertTrue(os.path.exists(tc_filename + '.info.json')) if 'md5' in tc: - md5_for_file = _file_md5(tc['file']) + md5_for_file = _file_md5(tc_filename) self.assertEqual(md5_for_file, tc['md5']) - with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: + with io.open(tc_filename + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, expected) in tc.get('info_dict', {}).items(): if isinstance(expected, compat_str) and expected.startswith('md5:'): @@ -126,10 +136,7 @@ def generator(test_case): for key in ('id', 'url', 'title', 'ext'): self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: - for tc in test_cases: - try_rm(tc['file']) - try_rm(tc['file'] + '.part') - try_rm(tc['file'] + '.info.json') + try_rm_tcs_files() return test_template diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 313295839..060678e9b 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -272,7 +272,7 @@ class YoutubeDL(object): autonumber_size = 5 autonumber_templ = u'%0' + str(autonumber_size) + u'd' template_dict['autonumber'] = autonumber_templ % self._num_downloads - if template_dict['playlist_index'] is not None: + if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] sanitize = lambda k, v: sanitize_filename( From 2563bcc85cc09382d7e731709b2c8a4ad96c7ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 22:02:17 +0100 Subject: [PATCH 53/81] Add an extractor for MySpace (closes #1666) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/myspace.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/myspace.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..caaf54456 100644 --- 
a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -83,6 +83,7 @@ from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE from .muzu import MuzuTVIE +from .myspace import MySpaceIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py new file mode 100644 index 000000000..050f54a5a --- /dev/null +++ b/youtube_dl/extractor/myspace.py @@ -0,0 +1,48 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, +) + + +class MySpaceIE(InfoExtractor): + _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)' + + _TEST = { + u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689', + u'info_dict': { + u'id': u'100008689', + u'ext': u'flv', + u'title': u'Viva La Vida', + u'description': u'The official Viva La Vida video, directed by Hype Williams', + u'uploader': u'Coldplay', + u'uploader_id': u'coldplay', + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + context = json.loads(self._search_regex(r'context = ({.*?});', webpage, + u'context')) + video = context['video'] + rtmp_url, play_path = video['streamUrl'].split(';', 1) + + return { + 'id': compat_str(video['mediaId']), + 'title': video['title'], + 'url': rtmp_url, + 'play_path': play_path, + 'ext': 'flv', + 'description': video['description'], + 'thumbnail': video['imageUrl'], + 'uploader': video['artistName'], + 'uploader_id': video['artistUsername'], + } From dd508b7c4f0dd8881de07a4e8593d4fcdef9bae7 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 28 Oct 2013 18:03:26 -0400 Subject: [PATCH 54/81] [tests] don't fail on network errors This is suboptimal, but at least this way we will need to look at
the logs only to check for network errors that happen too often, instead of parsing a ton of lines each time to see if there is some true test failing --- test/helper.py | 17 +++++++++++++++++ test/test_download.py | 22 +++++++++++++++++----- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/test/helper.py b/test/helper.py index 777119ea5..d7bf7a828 100644 --- a/test/helper.py +++ b/test/helper.py @@ -5,9 +5,11 @@ import json import os.path import re import types +import sys import youtube_dl.extractor from youtube_dl import YoutubeDL +from youtube_dl.utils import preferredencoding def global_setup(): @@ -33,6 +35,21 @@ def try_rm(filename): raise +def report_warning(message): + ''' + Print the message to stderr, it will be prefixed with 'WARNING:' + If stderr is a tty file the 'WARNING:' will be colored + ''' + if sys.stderr.isatty() and os.name != 'nt': + _msg_header = u'\033[0;33mWARNING:\033[0m' + else: + _msg_header = u'WARNING:' + output = u'%s %s\n' % (_msg_header, message) + if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: + output = output.encode(preferredencoding()) + sys.stderr.write(output) + + class FakeYDL(YoutubeDL): def __init__(self, override=None): # Different instances of the downloader can't share the same dictionary diff --git a/test/test_download.py b/test/test_download.py index f136176b1..565afa1b5 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -6,7 +6,14 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, get_testcases, global_setup, try_rm, md5 +from test.helper import ( + get_params, + get_testcases, + global_setup, + try_rm, + md5, + report_warning +) global_setup() @@ -92,17 +99,22 @@ def generator(test_case): try_rm(tc_filename + '.info.json') try_rm_tcs_files() try: - for retry in range(1, RETRIES + 1): + try_num = 1 + while True: try: ydl.download([test_case['url']]) except (DownloadError, 
ExtractorError) as err: - if retry == RETRIES: raise - # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): raise - print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry)) + if try_num == RETRIES: + report_warning(u'Failed due to network errors, skipping...') + return + + print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) + + try_num += 1 else: break From 646e17a53d3885b84b03045728b3add3d50f513c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 28 Oct 2013 23:18:13 +0100 Subject: [PATCH 55/81] Fix YouTubeDL test --- test/test_YoutubeDL.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8cd1bdce..ffebb4ae5 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -62,10 +62,10 @@ class TestFormatSelection(unittest.TestCase): def test_format_limit(self): formats = [ - {u'format_id': u'meh'}, - {u'format_id': u'good'}, - {u'format_id': u'great'}, - {u'format_id': u'excellent'}, + {u'format_id': u'meh', u'url': u'http://example.com/meh'}, + {u'format_id': u'good', u'url': u'http://example.com/good'}, + {u'format_id': u'great', u'url': u'http://example.com/great'}, + {u'format_id': u'excellent', u'url': u'http://example.com/exc'}, ] info_dict = { u'formats': formats, u'extractor': u'test', 'id': 'testvid'} From 321a01f97110c3048e9d9c360a099d1ec8cd4479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 28 Oct 2013 23:37:01 +0100 Subject: [PATCH 56/81] [mtv] Remove the templates from the mediagen url --- youtube_dl/extractor/mtv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e520e2bb4..e96d3952c 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -80,6 +80,8 @@ class MTVIE(InfoExtractor): video_id = 
self._id_from_uri(uri) self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + # Remove the templates, like &device={device} + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' mediagen_page = self._download_webpage(mediagen_url, video_id, From f6cc16f5d821a50df173b865164e4fa9cbe854af Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 28 Oct 2013 19:07:16 -0400 Subject: [PATCH 57/81] [tests] a HTTP 503 is a transient issue --- test/test_download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 565afa1b5..dfb04d010 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -26,6 +26,7 @@ import youtube_dl.YoutubeDL from youtube_dl.utils import ( compat_str, compat_urllib_error, + compat_HTTPError, DownloadError, ExtractorError, UnavailableVideoError, @@ -105,7 +106,7 @@ def generator(test_case): ydl.download([test_case['url']]) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one - if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): raise if try_num == RETRIES: From 795f28f871074aca2a74dfe67e1e75252b525c4c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 06:45:54 +0100 Subject: [PATCH 58/81] [youtube] Fix login (Fixes #1681) --- youtube_dl/extractor/youtube.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d05d0a8c1..f3a2a32b4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ 
-74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return False - galx = None - dsh = None - match = re.search(re.compile(r' Date: Tue, 29 Oct 2013 06:48:39 +0100 Subject: [PATCH 59/81] release 2013.10.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 048afc8e7..1a94003bc 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.28' +__version__ = '2013.10.29' From 912cbf5d4ef5b131af88e63815863c389083d077 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 14:00:01 +0100 Subject: [PATCH 60/81] [vevo] Fix timestamp handling ( / 1000 is implicit float division ) --- youtube_dl/extractor/vevo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 1c1cc418d..26ec9fa1b 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -58,9 +58,9 @@ class VevoIE(InfoExtractor): 'width': int(attr['frameWidth']), }) - date_epoch = int(self._search_regex( - r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000 - upload_date = datetime.datetime.fromtimestamp(date_epoch) + timestamp_ms = int(self._search_regex( + r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) + upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000) info = { 'id': video_id, 'title': video_info['title'], From 57dd9a8f2f5885fb3d909c4905adb69b4749491c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 15:09:45 +0100 Subject: [PATCH 61/81] Nicer --list-formats output --- youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 060678e9b..260cd2809 100644 --- 
a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -759,6 +759,8 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): + if format.get('_resolution') is not None: + return format['_resolution'] if format.get('height') is not None: if format.get('width') is not None: res = u'%sx%s' % (format['width'], format['height']) @@ -769,19 +771,22 @@ class YoutubeDL(object): return res def list_formats(self, info_dict): - formats_s = [] - for format in info_dict.get('formats', [info_dict]): - formats_s.append(u'%-15s%-7s %-15s%s' % ( + def line(format): + return (u'%-15s%-10s%-12s%s' % ( format['format_id'], format['ext'], - format.get('format_note', ''), self.format_resolution(format), + format.get('format_note', ''), ) ) + + formats_s = list(map(line, info_dict.get('formats', [info_dict]))) if len(formats_s) != 1: - formats_s[0] += ' (worst)' - formats_s[-1] += ' (best)' - formats_s = "\n".join(formats_s) - self.to_screen(u'[info] Available formats for %s:\n' - u'format code extension note resolution\n%s' % ( - info_dict['id'], formats_s)) + formats_s[0] += (' ' if formats_s[0] else '') + '(worst)' + formats_s[-1] += (' ' if formats_s[-1] else '') + '(best)' + + header_line = line({ + 'format_id': u'format code', 'ext': u'extension', + '_resolution': u'resolution', 'format_note': u'note'}) + self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % + (info_dict['id'], header_line, u"\n".join(formats_s))) From e54fd4b23b8110779e8caff805d3078dcf042d0b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 29 Oct 2013 15:10:09 +0100 Subject: [PATCH 62/81] [vevo] Add more format details --- youtube_dl/extractor/vevo.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 26ec9fa1b..4d9f2a843 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -50,10 +50,11 @@ class VevoIE(InfoExtractor): # Already sorted 
from worst to best quality for rend in renditions.findall('rendition'): attr = rend.attrib - f_url = attr['url'] + format_note = '%(videoCodec)s@%(videoBitrate)4sK, %(audioCodec)s@%(audioBitrate)3sK' % attr formats.append({ - 'url': f_url, - 'ext': determine_ext(f_url), + 'url': attr['url'], + 'format_id': attr['name'], + 'format_note': format_note, 'height': int(attr['frameheight']), 'width': int(attr['frameWidth']), }) @@ -71,7 +72,4 @@ class VevoIE(InfoExtractor): 'duration': video_info['duration'], } - # TODO: Remove when #980 has been merged - info.update(formats[-1]) - return info From 21c924f4068692786e0c5435689d10f3d17ef612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 29 Oct 2013 20:58:49 +0100 Subject: [PATCH 63/81] [arte] Download the 'Originalversion' version if it's the only one available (fixes #1682) --- youtube_dl/extractor/arte.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d39b48951..e10c74c11 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -158,7 +158,9 @@ class ArteTVPlus7IE(InfoExtractor): 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } - formats = player_info['VSR'].values() + all_formats = player_info['VSR'].values() + # Some formats use the m3u8 protocol + all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats)) def _match_lang(f): if f.get('versionCode') is None: return True @@ -170,11 +172,16 @@ class ArteTVPlus7IE(InfoExtractor): regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url - formats = filter(_match_lang, formats) - # Some formats use the m3u8 protocol - formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) - # We order the formats by quality + formats = 
filter(_match_lang, all_formats) formats = list(formats) # in python3 filter returns an iterator + if not formats: + # Some videos are only available in the 'Originalversion' + # they aren't tagged as being in French or German + if all(f['versionCode'] == 'VO' for f in all_formats): + formats = all_formats + else: + raise ExtractorError(u'The formats list is empty') + # We order the formats by quality if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) else: From b9a836515fad5df57a86412b2cd41c49869ec0d6 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Tue, 29 Oct 2013 16:44:35 -0400 Subject: [PATCH 64/81] Update the Vimeo test vector md5 confirmed that this is indeed the first 10241 (we went off by one with byte range 0-10240) of the full, playing mp4, so they probably reencoded or something --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b4dbcd2ee..c7d864a2b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -27,7 +27,7 @@ class VimeoIE(InfoExtractor): { u'url': u'http://vimeo.com/56015672#at=0', u'file': u'56015672.mp4', - u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', + u'md5': u'8879b6cc097e987f02484baf890129e5', u'info_dict': { u"upload_date": u"20121220", u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", From 94badb2599e54bfd711b38f3a74c552ff652d6d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:09:26 +0100 Subject: [PATCH 65/81] Fix output indenting for --list-formats --- youtube_dl/YoutubeDL.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 260cd2809..898533496 100644 --- a/youtube_dl/YoutubeDL.py +++ 
b/youtube_dl/YoutubeDL.py @@ -780,10 +780,11 @@ class YoutubeDL(object): ) ) - formats_s = list(map(line, info_dict.get('formats', [info_dict]))) - if len(formats_s) != 1: - formats_s[0] += (' ' if formats_s[0] else '') + '(worst)' - formats_s[-1] += (' ' if formats_s[-1] else '') + '(best)' + formats = info_dict.get('formats', [info_dict]) + formats_s = list(map(line, formats)) + if len(formats) > 1: + formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)' + formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', From b5d0d817bc8a23ef6dc2a00d1af6fad893143206 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:09:44 +0100 Subject: [PATCH 66/81] Remove superfluous space --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce349fe20..cef4dce85 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,7 +63,7 @@ class InfoExtractor(object): * ext Will be calculated from url if missing * format A human-readable description of the format ("mp4 container with h264/opus"). - Calculated from the format_id, width, height + Calculated from the format_id, width, height. and format_note fields if missing. 
* format_id A short description of the format ("mp4_h264_opus" or "19") From 72321ead7b176824d1a8b2895ad4926555e41b88 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:14:17 +0100 Subject: [PATCH 67/81] [vevo] Readd support for SMIL (Fixes #1683) --- youtube_dl/extractor/vevo.py | 80 +++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4d9f2a843..3f6020f74 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,7 @@ import datetime from .common import InfoExtractor from ..utils import ( - determine_ext, + compat_HTTPError, ExtractorError, ) @@ -16,26 +16,22 @@ class VevoIE(InfoExtractor): (currently used by MTVIE) """ _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' - _TEST = { + _TESTS = [{ u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', u'file': u'GB1101300280.mp4', + u"md5": u"06bea460acb744eab74a9d7dcb4bfd61", u'info_dict': { u"upload_date": u"20130624", u"uploader": u"Hurts", u"title": u"Somebody to Die For", - u'duration': 230, + u"duration": 230, + u"width": 1920, + u"height": 1080, } - } + }] + _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - - self.report_extraction(video_id) - video_info = json.loads(info_json)['video'] + def _formats_from_json(self, video_info): last_version = {'version': -1} for version in video_info['videoVersions']: # These are the HTTP downloads, other types are for different manifests @@ -50,7 +46,7 @@ class VevoIE(InfoExtractor): # Already sorted from worst to best quality for rend in renditions.findall('rendition'): attr =
rend.attrib - format_note = '%(videoCodec)s@%(videoBitrate)4sK, %(audioCodec)s@%(audioBitrate)3sK' % attr + format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr formats.append({ 'url': attr['url'], 'format_id': attr['name'], @@ -58,6 +54,62 @@ class VevoIE(InfoExtractor): 'height': int(attr['frameheight']), 'width': int(attr['frameWidth']), }) + return formats + + def _formats_from_smil(self, smil_xml): + formats = [] + smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') + for el in els: + src = el.attrib['src'] + m = re.match(r'''(?xi) + (?P<ext>[a-z0-9]+): + (?P<path> + [/a-z0-9]+ # The directory and main part of the URL + _(?P<cbr>[0-9]+)k + _(?P<width>[0-9]+)x(?P<height>[0-9]+) + _(?P<vcodec>[a-z0-9]+) + _(?P<vbr>[0-9]+) + _(?P<acodec>[a-z0-9]+) + _(?P<abr>[0-9]+) + \.[a-z0-9]+ # File extension + )''', src) + if not m: + continue + + format_url = self._SMIL_BASE_URL + m.group('path') + format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' % + m.groupdict()) + formats.append({ + 'url': format_url, + 'format_id': u'SMIL_' + m.group('cbr'), + 'format_note': format_note, + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + return formats + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + info_json = self._download_webpage(json_url, video_id, u'Downloading json info') + video_info = json.loads(info_json)['video'] + + formats = self._formats_from_json(video_info) + try: + smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( + self._SMIL_BASE_URL, video_id, video_id.lower()) + smil_xml = self._download_webpage(smil_url, video_id, + u'Downloading SMIL info') + formats.extend(self._formats_from_smil(smil_xml)) + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError): + raise +
self._downloader.report_warning( + u'Cannot download SMIL information, falling back to JSON ..') timestamp_ms = int(self._search_regex( r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) From 7193498811cb17a66ca57569a8588adb28ba2b27 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:17:00 +0100 Subject: [PATCH 68/81] Use index in format string (Fixes vevo test on Python 2.6) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 898533496..7f73ea360 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -482,7 +482,7 @@ class YoutubeDL(object): format['format'] = u'{id} - {res}{note}'.format( id=format['format_id'], res=self.format_resolution(format), - note=u' ({})'.format(format['format_note']) if format.get('format_note') is not None else '', + note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing if 'ext' not in format: From 33b1d9595d853893b5d732863dc2f5eabd939637 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Oct 2013 01:17:20 +0100 Subject: [PATCH 69/81] release 2013.10.30 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1a94003bc..e8eade7ad 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.29' +__version__ = '2013.10.30' From 9f1109a56424d118263963062bc5185d8415835e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 31 Oct 2013 00:20:49 +0100 Subject: [PATCH 70/81] [dailymotion] Fix support for age-restricted videos (Fixes #1688) --- youtube_dl/extractor/dailymotion.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index
4c0488245..355b4ed0a 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): """Build a request with the family filter disabled""" request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') + request.add_header('Cookie', 'ff=off') return request class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): @@ -61,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): }, u'skip': u'VEVO is only available in some countries', }, + # age-restricted video + { + u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + u'file': u'xyh2zz.mp4', + u'md5': u'0d667a7b9cebecc3c89ee93099c4159d', + u'info_dict': { + u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + u'uploader': 'HotWaves1012', + u'age_limit': 18, + } + + } ] def _real_extract(self, url): @@ -90,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) @@ -132,15 +146,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id) return - return [{ + return { 'id': video_id, 'formats': formats, 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, - 'thumbnail': info['thumbnail_url'] - }] + 'thumbnail': info['thumbnail_url'], + 'age_limit': age_limit, + } def _get_available_subtitles(self, video_id): try: From 0ef7ad5cd49d527a24c62e831cf80f2eb443276f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 31 Oct 2013 07:55:03 +0100 Subject: [PATCH 71/81] Fix the test for dailymotion subtitles The extractor returns a single info_dict now. --- test/test_dailymotion_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py index c596415c4..ba3580ea4 100644 --- a/test/test_dailymotion_subtitles.py +++ b/test/test_dailymotion_subtitles.py @@ -22,7 +22,7 @@ class TestDailymotionSubtitles(unittest.TestCase): return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict[0]['subtitles'] + return info_dict['subtitles'] def test_no_writesubtitles(self): subtitles = self.getSubtitles() self.assertEqual(subtitles, None) From 5f1ea943ab6814c2f8ca2a383f990e3f4c9e5f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 31 Oct 2013 08:07:26 +0100 Subject: [PATCH 72/81] [livestream] fix the extraction of events It now uses a json dictionary from the webpage. 
--- youtube_dl/extractor/livestream.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index d04da98c8..4531fd6ab 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -40,13 +40,9 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - player = get_meta_content('twitter:player', webpage) - if player is None: - raise ExtractorError('Couldn\'t extract event api url') - api_url = player.replace('/player', '') - api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) - info = json.loads(self._download_webpage(api_url, event_name, - u'Downloading event info')) + config_json = self._search_regex(r'window.config = ({.*?});', + webpage, u'window config') + info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) for video_data in info['feed']['data'] if video_data['type'] == u'video'] return self.playlist_result(videos, info['id'], info['full_name']) From 66cf3ac3426b62fb960b4de770c4ea8203a0e205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Nov 2013 11:55:35 +0100 Subject: [PATCH 73/81] [metacafe] Fix support for age-restricted videos (fixes #1696) The 'Content-Type' header must be set for disabling the family filter. The 'flashversion' cookie is only needed for AnyClip videos. Added tests for standard metacafe videos and for age-restricted videos. Also set the 'age_limit' field. 
--- youtube_dl/extractor/metacafe.py | 51 ++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 234b9e80f..91480ba87 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - _TESTS = [{ + _TESTS = [ + # Youtube video + { u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", u"file": u"_aUehQsCQtM.mp4", @@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor): u"uploader_id": u"PBS" } }, + # Normal metacafe video + { + u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', + u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad', + u'info_dict': { + u'id': u'11121940', + u'ext': u'mp4', + u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4', + u'uploader': u'ign', + u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', + }, + }, + # AnyClip video { u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", u"file": u"an-dVVXnuY7Jh77J.mp4", u"info_dict": { u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", u"uploader": u"anyclip", - u"description": u"md5:38c711dd98f5bb87acf973d573442e67" - } - }] + u"description": u"md5:38c711dd98f5bb87acf973d573442e67", + }, + }, + # age-restricted video + { + u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', + u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09', + u'info_dict': { + u'id': u'5186653', + u'ext': u'mp4', + u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', + 
u'uploader': u'Dwayne Pipe', + u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b', + u'age_limit': 18, + }, + }, + ] def report_disclaimer(self): @@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor): 'submit': "Continue - I'm over 18", } request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') try: self.report_age_confirmation() compat_urllib_request.urlopen(request).read() @@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor): # Retrieve video webpage to extract further information req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) - req.headers['Cookie'] = 'flashVersion=0;' + + # AnyClip videos require the flashversion cookie so that we get the link + # to the mp4 file + mobj_an = re.match(r'^an-(.*?)$', video_id) + if mobj_an: + req.headers['Cookie'] = 'flashVersion=0;' webpage = self._download_webpage(req, video_id) # Extract URL, uploader and title from webpage @@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor): r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, u'uploader nickname', fatal=False) + if re.search(r'"contentRating":"restricted"', webpage) is not None: + age_limit = 18 + else: + age_limit = 0 + return { '_type': 'video', 'id': video_id, @@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor): 'upload_date': None, 'title': video_title, 'ext': video_ext, + 'age_limit': age_limit, } From 60d142aa8d896674ca2b062a53b3d18c644192ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Nov 2013 22:28:51 +0100 Subject: [PATCH 74/81] Add an extractor for vk.com (closes #1635) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vk.py | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 youtube_dl/extractor/vk.py diff --git a/youtube_dl/extractor/__init__.py 
b/youtube_dl/extractor/__init__.py index caaf54456..bcf1cce7f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -142,6 +142,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE from .weibo import WeiboIE diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py new file mode 100644 index 000000000..90d8a6d07 --- /dev/null +++ b/youtube_dl/extractor/vk.py @@ -0,0 +1,45 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_str, + unescapeHTML, +) + + +class VKIE(InfoExtractor): + IE_NAME = u'vk.com' + _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + + _TEST = { + u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + u'md5': u'0deae91935c54e00003c2a00646315f0', + u'info_dict': { + u'id': u'162222515', + u'ext': u'flv', + u'title': u'ProtivoGunz - Хуёвая песня', + u'uploader': u'Noize MC', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id + info_page = self._download_webpage(info_url, video_id) + m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) + if m_yt is not None: + self.to_screen(u'Youtube video detected') + return self.url_result(m_yt.group(1), 'Youtube') + vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') + vars = json.loads(vars_json) + + return { + 'id': compat_str(vars['vid']), + 'url': vars['url240'], + 'title': unescapeHTML(vars['md_title']), + 'thumbnail': vars['jpg'], + 'uploader': vars['md_author'], + } From 8eddf3e91ddab3bb766bc5176edb3120be5743ea Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 2 Nov 2013 11:21:05 +0100 Subject:
[PATCH 75/81] [youtube] Encode subtitle track name in request (Fixes #1700) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f3a2a32b4..dc601de52 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1111,7 +1111,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat'), - 'name': l[0], + 'name': l[0].encode('utf-8'), }) url = u'http://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url From aa2484e390d8a5e74d740fda61b4062a4a8c1d0e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 2 Nov 2013 11:21:36 +0100 Subject: [PATCH 76/81] release 2013.11.02 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e8eade7ad..75a46a2d5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.30' +__version__ = '2013.11.02' From 72a5b4f70216fe1a5b1c22be34653ae0ff81058a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 19:01:01 +0100 Subject: [PATCH 77/81] Add an extractor for bambuser.com (#1702) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bambuser.py | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 youtube_dl/extractor/bambuser.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bcf1cce7f..a1e35eb46 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .arte import ( ArteTVFutureIE, ) from .auengine import AUEngineIE +from .bambuser import BambuserIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff 
--git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py new file mode 100644 index 000000000..cf8da22e3 --- /dev/null +++ b/youtube_dl/extractor/bambuser.py @@ -0,0 +1,42 @@ +import re +import json + +from .common import InfoExtractor + + +class BambuserIE(InfoExtractor): + _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' + _API_KEY = '005f64509e19a868399060af746a00aa' + + _TEST = { + u'url': u'http://bambuser.com/v/4050584', + u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', + u'info_dict': { + u'id': u'4050584', + u'ext': u'flv', + u'title': u'Education engineering days - lightning talks', + u'duration': 3741, + u'uploader': u'pixelversity', + u'uploader_id': u'344706', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = ('http://player-c.api.bambuser.com/getVideo.json?' + '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json)['result'] + + return { + 'id': video_id, + 'title': info['title'], + 'url': info['url'], + 'thumbnail': info['preview'], + 'duration': int(info['length']), + 'view_count': int(info['views_total']), + 'uploader': info['username'], + 'uploader_id': info['uid'], + } + From 165e3bb67a6d737f33d0aa2024c652b363d85ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 19:50:57 +0100 Subject: [PATCH 78/81] [bambuser] Add an extractor for channels (closes #1702) --- test/test_playlists.py | 9 +++++++ youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/bambuser.py | 40 +++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index d6a8d56df..de1e8d88e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -20,6 +20,7 @@ from youtube_dl.extractor import ( SoundcloudUserIE, LivestreamIE, NHLVideocenterIE, +
BambuserChannelIE, ) @@ -85,5 +86,13 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], u'Highlights') self.assertEqual(len(result['entries']), 12) + def test_bambuser_channel(self): + dl = FakeYDL() + ie = BambuserChannelIE(dl) + result = ie.extract('http://bambuser.com/channel/pixelversity') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'pixelversity') + self.assertTrue(len(result['entries']) >= 66) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a1e35eb46..a69c08f51 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,7 +9,7 @@ from .arte import ( ArteTVFutureIE, ) from .auengine import AUEngineIE -from .bambuser import BambuserIE +from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index cf8da22e3..f3b36f473 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -1,10 +1,15 @@ import re import json +import itertools from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, +) class BambuserIE(InfoExtractor): + IE_NAME = u'bambuser' _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' _API_KEY = '005f64509e19a868399060af746a00aa' @@ -33,10 +38,43 @@ class BambuserIE(InfoExtractor): 'id': video_id, 'title': info['title'], 'url': info['url'], - 'thumbnail': info['preview'], + 'thumbnail': info.get('preview'), 'duration': int(info['length']), 'view_count': int(info['views_total']), 'uploader': info['username'], 'uploader_id': info['uid'], } + +class BambuserChannelIE(InfoExtractor): + IE_NAME = u'bambuser:channel' + _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)' + # The maximum number we can get with each request + _STEP = 50 + + def
_real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + urls = [] + last_id = '' + for i in itertools.count(1): + req_url = ('http://bambuser.com/xhr-api/index.php?username={user}' + '&sort=created&access_mode=0%2C1%2C2&limit={count}' + '&method=broadcast&format=json&vid_older_than={last}' + ).format(user=user, count=self._STEP, last=last_id) + req = compat_urllib_request.Request(req_url) + # Without setting this header, we wouldn't get any result + req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) + info_json = self._download_webpage(req, user, + u'Downloading page %d' % i) + results = json.loads(info_json)['result'] + if len(results) == 0: + break + last_id = results[-1]['vid'] + urls.extend(self.url_result(v['page'], 'Bambuser') for v in results) + + return { + '_type': 'playlist', + 'title': user, + 'entries': urls, + } From cf519235455f312ac45e1d9829018eb5ecbec628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 20:46:26 +0100 Subject: [PATCH 79/81] [youtube] Remove vevo test The video is no longer available and it seems that vevo videos don't use encrypted signatures anymore. --- youtube_dl/extractor/youtube.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dc601de52..a19abe1f0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -339,18 +339,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
} }, - { - u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U", - u"file": u"1ltcDfZMA3U.mp4", - u"note": u"Test VEVO video (#897)", - u"info_dict": { - u"upload_date": u"20070518", - u"title": u"Maps - It Will Find You", - u"description": u"Music video by Maps performing It Will Find You.", - u"uploader": u"MuteUSA", - u"uploader_id": u"MuteUSA" - } - }, { u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY", u"file": u"UxxajLWwzqY.mp4", From 98d7efb537975b29ccaea64ff2765a0ec7bdb07d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 20:51:09 +0100 Subject: [PATCH 80/81] [exfm] skip tests The site is down too often. --- youtube_dl/extractor/exfm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index c74556579..a51d79b08 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor): u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', }, u'note': u'Soundcloud song', + u'skip': u'The site is down too often', }, { u'url': u'http://ex.fm/song/wddt8', @@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor): u'title': u'Safe and Sound', u'uploader': u'Capital Cities', }, + u'skip': u'The site is down too often', }, ] From f52f01b5d2ed117070475b0c7593a55d417e8e41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Nov 2013 21:20:46 +0100 Subject: [PATCH 81/81] [brightcove] Don't set the extension If the video only has the 'FLVFullLengthURL' key, it can still be an mp4 file. 
--- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1392f382a..0d9b87a34 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,7 +23,7 @@ class BrightcoveIE(InfoExtractor): # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', u'file': u'2371591881001.mp4', - u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'md5': u'8eccab865181d29ec2958f32a6a754f5', u'note': u'Test Brightcove downloads and detection in GenericIE', u'info_dict': { u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', @@ -122,12 +122,10 @@ class BrightcoveIE(InfoExtractor): best_format = renditions[-1] info.update({ 'url': best_format['defaultURL'], - 'ext': 'mp4', }) elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], - 'ext': 'flv', }) else: raise ExtractorError(u'Unable to extract video url for %s' % info['id'])