diff --git a/Makefile b/Makefile index fdb1abb60..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -61,34 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py chmod a+x youtube-dl README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py + COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md - python devscripts/make_contributing.py README.md CONTRIBUTING.md + $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md supportedsites: - python devscripts/make_supportedsites.py docs/supportedsites.md + $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in - python devscripts/bash-completion.py + $(PYTHON) devscripts/bash-completion.py bash-completion: youtube-dl.bash-completion youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in - python devscripts/zsh-completion.py + $(PYTHON) devscripts/zsh-completion.py zsh-completion: youtube-dl.zsh youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in - python devscripts/fish-completion.py + $(PYTHON) devscripts/fish-completion.py fish-completion: youtube-dl.fish diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 92765a3f9..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from ..utils import ( encodeArgument, encodeFilename, sanitize_open, + handle_youtubedl_headers, ) @@ -33,9 +34,10 @@ class HlsFD(FileDownloader): if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) args += [ '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items() if key.lower() != 'accept-encoding')] + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..7b685d157 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,6 +9,7 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE @@ -18,33 +19,32 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,67 +57,77 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) + def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) + files = data['files'] + capfile = get_text_attr(data, 'capfile') - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None - - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index fd854411b..321eec59e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -164,7 +164,7 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', - webpage, 'alternative title', fatal=False) + webpage, 'alternative title', default=None) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 965940a4b..08275687d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -147,7 +147,8 @@ class PornHubPlaylistIE(InfoExtractor): entries = [ self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') - for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage)) + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) ] playlist = self._parse_json( diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 5bd3c0087..39a7aaf9d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -58,7 +58,8 @@ class SpiegelIE(InfoExtractor): description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( - r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') + [r'server\s*:\s*(["\'])(?P.+?)\1', r'var\s+server\s*=\s*"(?P[^"]+)\"'], + webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index be0a2780f..357594a11 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + ExtractorError, + sanitized_Request, +) class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', @@ -24,6 +27,12 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '>THIS FILE WAS DELETED<', + '>File Not Found<', + 'The file you were looking for could not be found, sorry for any inconvenience.<')): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1c2420a33..9b39505ba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -258,7 +258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) @@ -346,6 +346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, @@ -730,6 +731,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1475,6 +1480,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + for a_format in formats: + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1559,7 +1567,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract youtube\.com/ (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?&)*? (?:p|a|list)= + \? (?:.*?[&;])*? (?:p|a|list)= | p/ ) ( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7b737e21..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -663,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -670,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -731,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments