From 00d8d755fcd949e8374e548345c74e4288717542 Mon Sep 17 00:00:00 2001
From: Mattias Wadman
Date: Mon, 23 Jan 2017 20:43:42 +0100
Subject: [PATCH] [infoq] Probe if audio URL is valid

Make it possible to pass headers to _is_valid_url
---
 youtube_dl/extractor/common.py |  4 ++--
 youtube_dl/extractor/infoq.py  | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 6fa7c334e..a257721f1 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1015,13 +1015,13 @@ class InfoExtractor(object):
                 unique_formats.append(f)
         formats[:] = unique_formats
 
-    def _is_valid_url(self, url, video_id, item='video'):
+    def _is_valid_url(self, url, video_id, item='video', headers={}):
         url = self._proto_relative_url(url, scheme='http:')
         # For now assume non HTTP(S) URLs always valid
         if not (url.startswith('http://') or url.startswith('https://')):
             return True
         try:
-            self._request_webpage(url, video_id, 'Checking %s URL' % item)
+            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
             return True
         except ExtractorError as e:
             if isinstance(e.cause, compat_urllib_error.URLError):
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index d63e95854..08f9e86a8 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -73,22 +73,28 @@ class InfoQIE(BokeCCBaseIE):
         },
     }]
 
-    def _extract_http_audio(self, webpage):
+    def _extract_http_audio(self, webpage, video_id):
         fields = self._hidden_inputs(webpage)
         http_audio_url = fields['filename']
         if http_audio_url is None:
             return []
+
+        cookies_header = {'Cookie': self._extract_cookies(webpage)}
+
         # base URL is found in the Location header in the response returned by
         # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
         http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
 
+        # audio file seem to be missing some times even if there is a download link
+        # so probe URL to make sure
+        if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header):
+            return []
+
         return [{
             'format_id': 'http_audio',
             'url': http_audio_url,
             'vcodec': 'none',
-            'http_headers': {
-                'Cookie': self._extract_cookies(webpage)
-            },
+            'http_headers': cookies_header,
         }]
 
     def _real_extract(self, url):
@@ -105,7 +111,7 @@ class InfoQIE(BokeCCBaseIE):
         formats = (
             self._extract_rtmp_video(webpage) +
             self._extract_http_video(webpage) +
-            self._extract_http_audio(webpage))
+            self._extract_http_audio(webpage, video_id))
 
         self._sort_formats(formats)
 