From 984e4d487520bd2a860b31b3165416c879b28096 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 24 Jun 2015 01:13:23 +0100 Subject: [PATCH 01/92] [googledrive] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/googledrive.py | 106 ++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 youtube_dl/extractor/googledrive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,6 +209,7 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..8c611fa47 --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import RegexNotFoundError + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P.+?)(?:&|/|$)' + _TEST = { + 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'info_dict': { + 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'ext': 'mp4', + 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + } + } + _formats = { + '5': {'ext': 'flv'}, + '6': {'ext': 'flv'}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp'}, + '18': {'ext': 'mp4'}, + '22': {'ext': 'mp4'}, + '34': {'ext': 'flv'}, + '35': {'ext': 'flv'}, + '36': {'ext': '3gp'}, + '37': {'ext': 'mp4'}, + '38': {'ext': 'mp4'}, + '43': {'ext': 'webm'}, + '44': {'ext': 'webm'}, + '45': {'ext': 'webm'}, + '46': {'ext': 'webm'}, + '59': {'ext': 'mp4'} + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + ) + try: + title = self._html_search_regex( + r'"title","(?P.*?)"', + webpage, + 'title', + group='title' + ) + fmt_stream_map = self._html_search_regex( + r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + webpage, + 'fmt_stream_map', + group='fmt_stream_map' + ) + fmt_list = self._html_search_regex( + r'"fmt_list","(?P<fmt_list>.*?)"', + webpage, + 'fmt_list', + group='fmt_list' + ) +# timestamp = self._html_search_regex( +# r'"timestamp","(?P<timestamp>.*?)"', +# webpage, +# 'timestamp', +# group='timestamp' +# ) + length_seconds = self._html_search_regex( + r'"length_seconds","(?P<length_seconds>.*?)"', + webpage, + 'length_seconds', + group='length_seconds' + ) + except RegexNotFoundError: + try: + reason = self._html_search_regex( + r'"reason","(?P<reason>.*?)"', + webpage, + 'reason', + group='reason' + ) + self.report_warning(reason) + return + except RegexNotFoundError: + self.report_warning('not a video') + return + + fmt_stream_map = fmt_stream_map.split(',') + fmt_list = fmt_list.split(',') + formats = [] + for i in range(len(fmt_stream_map)): + fmt_id, fmt_url = fmt_stream_map[i].split('|') + resolution = fmt_list[i].split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int(width), + 'height': int(height), + 'ext': self._formats[fmt_id]['ext'] + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, +# 'timestamp': int(timestamp), + 'duration': int(length_seconds), + 'formats': formats + } From f120a7ab5e9c560a8114f9662e2f213243a945b0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 24 Jun 2015 14:56:19 +0100 Subject: [PATCH 02/92] change the _TEST info --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 8c611fa47..e3d5c3418 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,11 +4,11 @@ from ..utils import RegexNotFoundError class GoogleDriveIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' _TEST = { - 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { - 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + 'title': 'Big Buck Bunny.mp4', } } _formats = { From 3e5f3df1729846a33631dd38a887cd1d81a727c1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:53:21 +0100 Subject: [PATCH 03/92] move the embed to a separate class --- youtube_dl/extractor/googledrive.py | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index e3d5c3418..ac891b275 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,8 +1,37 @@ +import re + from .common import InfoExtractor from ..utils import RegexNotFoundError +class GoogleDriveEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _TEST = { + 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'info_dict': { + 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'ext': 'mp4', + 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url', + 'ie-key': 'GoogleDrive', + 'url': 'https://drive.google.com/file/d/%s' % video_id + } + class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From 2d651a2d02885cddf1752b45497e9113d3a3d403 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:55:44 +0100 Subject: [PATCH 04/92] import google drive embed class --- youtube_dl/extractor/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6655d7eb5..02e18a0da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,7 +209,10 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import GoogleDriveIE +from .googledrive import ( + GoogleDriveEmbedIE, + GoogleDriveIE, +) from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE From 653789afc72d1a225b971541fb633dd768d58942 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 08:01:30 +0100 Subject: [PATCH 05/92] add google drive embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..3f7b094db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,7 @@ from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE +from .googledrive import GoogleDriveEmbedIE class GenericIE(InfoExtractor): @@ -1599,6 +1600,11 @@ class GenericIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) From 3b3d531965f0f36c20f5fa8557481c144170653f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:17:19 +0100 Subject: [PATCH 06/92] fix embed regex --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index ac891b275..c82c9037f 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,7 +4,7 @@ from .common import InfoExtractor from ..utils import RegexNotFoundError class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', 'info_dict': { @@ -17,7 +17,7 @@ class GoogleDriveEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') @@ -31,7 +31,7 @@ class GoogleDriveEmbedIE(InfoExtractor): } class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From d1cc05e17eccccb7ee6473574c6a4f887104baeb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:37:21 +0100 Subject: [PATCH 07/92] remove unnecessary regex group names --- youtube_dl/extractor/googledrive.py | 32 ++++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c82c9037f..6d9bcfefd 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -62,46 +62,40 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' ) try: title = self._html_search_regex( - r'"title","(?P<title>.*?)"', + r'"title"\s+,\s+"[^"]+', webpage, - 'title', - group='title' + 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + r'"fmt_stream_map"\s+,\s+"[^"]+', webpage, - 'fmt_stream_map', - group='fmt_stream_map' + 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list","(?P<fmt_list>.*?)"', + r'"fmt_list"\s+,\s+"[^"]+', webpage, - 'fmt_list', - group='fmt_list' + 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp","(?P<timestamp>.*?)"', +# r'"timestamp"\s+,\s+"[^"]+', # webpage, -# 'timestamp', -# group='timestamp' +# 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds","(?P<length_seconds>.*?)"', + r'"length_seconds"\s+,\s+"[^"]+', webpage, - 'length_seconds', - group='length_seconds' + 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","(?P<reason>.*?)"', + r'"reason","[^"]+', webpage, - 'reason', - group='reason' + 'reason' ) self.report_warning(reason) return From 36dbca87848fc5698d3e0b89380c7bcec741ceaf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:52:01 +0100 Subject: [PATCH 08/92] fix recursive error --- youtube_dl/extractor/googledrive.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 6d9bcfefd..a3d9b4450 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -26,7 +26,7 @@ class GoogleDriveEmbedIE(InfoExtractor): video_id = self._match_id(url) return { '_type': 'url', - 'ie-key': 'GoogleDrive', + 'ie_key': 'GoogleDrive', 'url': 'https://drive.google.com/file/d/%s' % video_id } @@ -66,34 +66,34 @@ class GoogleDriveIE(InfoExtractor): ) try: title = self._html_search_regex( - r'"title"\s+,\s+"[^"]+', + r'"title"\s*,\s*"([^"]+)', webpage, 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s+,\s+"[^"]+', + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list"\s+,\s+"[^"]+', + r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp"\s+,\s+"[^"]+', +# r'"timestamp"\s*,\s*"([^"]+)', # webpage, # 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds"\s+,\s+"[^"]+', + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","[^"]+', + r'"reason","([^"]+)', webpage, 'reason' ) From 8e92d21ebf6f17e14c9e916f22e49f27529556af Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 23:31:14 +0100 Subject: [PATCH 09/92] [googledrive] raise ExtractorError instead of warning --- youtube_dl/extractor/googledrive.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index a3d9b4450..7bc7b7a0d 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import RegexNotFoundError +from ..utils import ( + RegexNotFoundError, + ExtractorError, +) class GoogleDriveEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' @@ -97,10 +100,10 @@ class GoogleDriveIE(InfoExtractor): webpage, 'reason' ) - self.report_warning(reason) + raise ExtractorError(reason) return except RegexNotFoundError: - self.report_warning('not a video') + raise ExtractorError('not a video') return fmt_stream_map = fmt_stream_map.split(',') From 8b55cadc83f198e0fa6bac7158f9b05826f39257 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 16:39:01 +0100 Subject: [PATCH 10/92] [canal13cl] fix info extraction --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/canal13cl.py | 48 ------------------- youtube_dl/extractor/tele13.py | 77 +++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 49 deletions(-) delete mode 100644 youtube_dl/extractor/canal13cl.py create mode 100644 youtube_dl/extractor/tele13.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5d2ea39d0..661b53e63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -67,7 +67,6 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) -from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE @@ -612,6 +611,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': '1403022125', - 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', - 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'twitter:description', webpage, 'description') - url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', webpage, 'url') - real_id = self._search_regex( - r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) - thumbnail = self._html_search_regex( - r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - - return { - 'id': real_id, - 'display_id': display_id, - 'url': url, - 'title': title, - 'description': description, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..5d89e757f --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', + } + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': '65d1ae54812c96f4b345dd21d3bb1adc', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:a1cd2e74f6ee6851552c9cf5851d6b06', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + setup_js = self._parse_json( + js_to_json( + self._search_regex( + r"jwplayer\('player-vivo'\).setup\((\{.*?\})\)", + webpage, + 'setup code', + flags=re.DOTALL + ).replace('\n//', '') + ), + display_id + ) + title = setup_js['title'] + thumbnail = setup_js.get('image') or setup_js['playlist'][0].get('image') + description = self._html_search_meta( + 'description', webpage, 'description') + + formats = [] + for f in setup_js['playlist'][0]['sources']: + format_url = f['file'] + if format_url != '': + if '.m3u8' in format_url: + formats.extend(self._extract_m3u8_formats(format_url, display_id)) + else: + if 'youtube.com' in format_url: + return self.url_result(format_url, 'Youtube') + else: + formats.append({'url': format_url, 'format_id': f.get('label')}) + + return { + 'id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } From 436416afe2ea70dd6b55f8c9d699ddb0bdc1ec5f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 21:13:49 +0100 Subject: [PATCH 11/92] [tele13] skip test --- youtube_dl/extractor/tele13.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 5d89e757f..f1764eb2f 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -16,8 +16,12 @@ class Tele13IE(InfoExtractor): 'info_dict': { 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', 'ext': 'mp4', - 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', - } + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, }, { 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', From b306c439d7f2997ebf2a88385c73fe2d92227b76 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 23 Sep 2015 13:28:05 +0100 Subject: [PATCH 12/92] [cnet] fix extraction and extract more formats --- youtube_dl/extractor/cnet.py | 54 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..2fac0d79d 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from .theplatform import ThePlatformIE class CNETIE(InfoExtractor): @@ -15,29 +13,22 @@ class CNETIE(InfoExtractor): 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'thumbnail': 're:^http://.*/flmswindows8.jpg$', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', }, - 'params': { - 'skip_download': 'requires rtmpdump', - } }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, }] def _real_extract(self, url): @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + r"<div class=\"videoPlayer\"\s+.*?data-cnet-video-uvp-options='([^']+)'", webpage, 'data json') data = json.loads(data_json) - vdata = data['video'] - if not vdata: - vdata = data['videos'][0] - if not vdata: - raise ExtractorError('Cannot find video data') - - mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files'].get('rtmp', vdata['files']['hds']) - tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) + vdata = data['videos'][0] video_id = vdata['id'] - title = vdata.get('headline') - if title is None: - title = vdata.get('title') - if title is None: - raise ExtractorError('Cannot find title!') - thumbnail = vdata.get('image', {}).get('path') + title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,27 @@ class CNETIE(InfoExtractor): uploader = None uploader_id = None + mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + tp = ThePlatformIE(self._downloader) + formats = [] + subtitles = {} + description = vdata.get('description') + + for vid in vdata['files'].values(): + result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) + formats.extend(result['formats']) + subtitles = self._merge_subtitles(subtitles, result['subtitles']) + description = description or result.get('description') + + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'url': tp_link, 'id': video_id, 'display_id': display_id, 'title': title, + 'description': description, 'uploader': uploader, 'uploader_id': uploader_id, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, } From 77302fe5c989b9cafcb675c0a03642b80fa557ff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 15 Oct 2015 23:27:46 +0100 Subject: [PATCH 13/92] [bliptv] remove extractor and add support for site replacement(makertv) --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bliptv.py | 292 --------------------------- youtube_dl/extractor/cinemassacre.py | 18 +- youtube_dl/extractor/generic.py | 6 - youtube_dl/extractor/jwplatform.py | 67 ++++++ youtube_dl/extractor/makertv.py | 27 +++ 6 files changed, 103 insertions(+), 310 deletions(-) delete mode 100644 youtube_dl/extractor/bliptv.py create mode 100644 youtube_dl/extractor/jwplatform.py create mode 100644 youtube_dl/extractor/makertv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 462717b1e..f9c40e6cd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,7 +54,6 @@ from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE @@ -263,6 +262,7 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -317,6 +317,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makertv import MakerTVIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index c3296283d..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,292 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - unescapeHTML, - xpath_text, - xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - - _TESTS = [ - { - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': '80baf1ec5c3d2019037c1c707d676b9f', - 'info_dict': { - 'id': '5779306', - 'ext': 'm4v', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'timestamp': 1323138843, - 'upload_date': '20111206', - 'uploader': 'cbr', - 'uploader_id': '679425', - 'duration': 81, - } - }, - { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'title': 'Red vs. Blue Season 11 Episode 1', - 'description': 'One-Zero-One', - 'timestamp': 1371261608, - 'upload_date': '20130615', - 'uploader': 'redvsblue', - 'uploader_id': '792887', - 'duration': 279, - } - }, - { - # https://bugzilla.redhat.com/show_bug.cgi?id=967465 - 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', - 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', - 'info_dict': { - 'id': '6573122', - 'ext': 'mov', - 'upload_date': '20130520', - 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', - 'title': 'Red vs. Blue Season 11 Trailer', - 'timestamp': 1369029609, - 'uploader': 'redvsblue', - 'uploader_id': '792887', - } - }, - { - 'url': 'http://blip.tv/play/gbk766dkj4Yn', - 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', - 'info_dict': { - 'id': '1749452', - 'ext': 'mp4', - 'upload_date': '20090208', - 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', - 'title': 'Nostalgia Critic: Transformers', - 'timestamp': 1234068723, - 'uploader': 'NostalgiaCritic', - 'uploader_id': '246467', - } - }, - { - # https://github.com/rg3/youtube-dl/pull/4404 - 'note': 'Audio only', - 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', - 'md5': '76c0a56f24e769ceaab21fbb6416a351', - 'info_dict': { - 'id': '7103299', - 'ext': 'flv', - 'title': 'Weekly Manga Recap: Kingdom', - 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', - 'timestamp': 1417660321, - 'upload_date': '20141204', - 'uploader': 'The Rollo T', - 'uploader_id': '407429', - 'duration': 7251, - 'vcodec': 'none', - } - }, - { - # missing duration - 'url': 'http://blip.tv/rss/flash/6700880', - 'info_dict': { - 'id': '6684191', - 'ext': 'm4v', - 'title': 'Cowboy Bebop: Gateway Shuffle Review', - 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', - 'timestamp': 1386639757, - 'upload_date': '20131210', - 'uploader': 'sfdebris', - 'uploader_id': '706520', - } - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) - if mobj: - return 'http://blip.tv/a/a-' + mobj.group(1) - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - lookup_id = mobj.group('lookup_id') - - # See https://github.com/rg3/youtube-dl/issues/857 and - # https://github.com/rg3/youtube-dl/issues/4197 - if lookup_id: - urlh = self._request_webpage( - 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') - url = compat_urlparse.urlparse(urlh.geturl()) - qs = compat_urlparse.parse_qs(url.query) - mobj = re.match(self._VALID_URL, qs['file'][0]) - - video_id = mobj.group('id') - - rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - - def _x(p): - return xpath_with_ns(p, { - 'blip': 'http://blip.tv/dtd/blip/1.0', - 'media': 'http://search.yahoo.com/mrss/', - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - }) - - item = rss.find('channel/item') - - video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id - title = xpath_text(item, 'title', 'title', fatal=True) - description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) - timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) - uploader = xpath_text(item, _x('blip:user'), 'uploader') - uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') - duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) - media_thumbnail = item.find(_x('media:thumbnail')) - thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None - else xpath_text(item, 'image', 'thumbnail')) - categories = [category.text for category in item.findall('category') if category is not None] - - formats = [] - subtitles_urls = {} - - media_group = item.find(_x('media:group')) - for media_content in media_group.findall(_x('media:content')): - url = media_content.get('url') - role = media_content.get(_x('blip:role')) - msg = self._download_webpage( - url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', - video_id, 'Resolving URL for %s' % role) - real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - - media_type = media_content.get('type') - if media_type == 'text/srt' or url.endswith('.srt'): - LANGS = { - 'english': 'en', - } - lang = role.rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles_urls[langcode] = url - elif media_type.startswith('video/'): - formats.append({ - 'url': real_url, - 'format_id': role, - 'format_note': media_type, - 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', - 'acodec': media_content.get(_x('blip:acodec')), - 'filesize': media_content.get('filesize'), - 'width': int_or_none(media_content.get('width')), - 'height': int_or_none(media_content.get('height')), - }) - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id, subtitles_urls) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, subtitles_urls): - subtitles = {} - for lang, url in subtitles_urls.items(): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - subtitles[lang] = [{ - # The extension is 'srt' but it's actually an 'ass' file - 'ext': 'ass', - 'data': self._download_webpage(req, None, note=False), - }] - return subtitles - - -class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = 'blip.tv:user' - _TEST = { - 'url': 'http://blip.tv/actone', - 'info_dict': { - 'id': 'actone', - 'title': 'Act One: The Series', - }, - 'playlist_count': 5, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, 'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - title = self._og_search_title(page) - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage( - url, username, 'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return self.playlist_result( - url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ExtractorError -from .bliptv import BlipTVIE from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor): }, }, { - # blip.tv embedded video + # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { - 'id': '4065369', - 'ext': 'flv', + 'id': 'OEVzPCY2T-g', + 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', - 'uploader': 'cinemassacre', - 'uploader_id': '250778', - 'timestamp': 1283233867, - 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + 'uploader': 'Cinemassacre', + 'uploader_id': 'JamesNintendoNerd', + 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { @@ -88,8 +86,6 @@ class CinemassacreIE(InfoExtractor): r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - playerdata_url = BlipTVIE._extract_url(webpage) if not playerdata_url: raise ExtractorError('Unable to find player data') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..285c0ff66 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -41,7 +41,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -1389,11 +1388,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..3a3dc439a --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8}', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + video_data = json_data['playlist'][0] + subtitles = {} + for track in video_data['tracks']: + if track['kind'] == 'captions': + subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None)) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_data['mediaid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..0256e4e24 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,27 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)?video|http://makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _TEST = { + 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'brOEcGut', + 'ext': 'mp4', + 'title': 'Maze Runner: The Scorch Trials Official Movie Review', + 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', + 'upload_date': '20150918', + 'timestamp': 1442549540, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex([r'jwid="([^"]+)"', r'Maker.jw_id\s*=\s*"([^"]+)";'], webpage, 'jwplatform id') + + return self.url_result('jwplatform:%s' % jwplatform_id, 'JWPlatform') From 02fb9804513ce1bfe28ec7c285526db7989e5844 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 02:08:19 +0100 Subject: [PATCH 14/92] [flickr] extract more info and formats --- youtube_dl/extractor/flickr.py | 110 ++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 2fe76d661..5ca754105 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,77 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_urllib_request +from ..compat import compat_urllib_parse from ..utils import ( - ExtractorError, - find_xpath_attr, + int_or_none, + qualities, ) class FlickrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', + 'md5': '164fe3fa6c22e18d448d4d5af2330f31', 'info_dict': { 'id': '5645318632', - 'ext': 'mp4', - "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", - "uploader_id": "forestwander-nature-pictures", - "title": "Dark Hollow Waterfalls" + 'ext': 'mpg', + 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', + 'uploader_id': 'forestwander-nature-pictures', + 'title': 'Dark Hollow Waterfalls', + 'duration': 19, + 'timestamp': 1303528740, + 'upload_date': '20110423', + 'uploader_id': '10922353@N03', + 'uploader': 'Forest Wander', + 'comment_count': int, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + _API_BASE_URL = 'https://api.flickr.com/services/rest?' + _API_KEY = '61b16865f916058e63580a912d9143be' - video_id = mobj.group('id') - video_uploader_id = mobj.group('uploader_id') - webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - req = compat_urllib_request.Request(webpage_url) - req.add_header( - 'User-Agent', - # it needs a more recent version - 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') - webpage = self._download_webpage(req, video_id) - - secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') - - first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') - - node_id = find_xpath_attr( - first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', - 'id').text - - second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') - - self.report_extraction(video_id) - - stream = second_xml.find('.//STREAM') - if stream is None: - raise ExtractorError('Unable to extract video url') - video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader_id': video_uploader_id, + def _call_api(self, method, video_id, secret=None): + query = { + 'photo_id': video_id, + 'method': 'flickr.%s' % method, + 'api_key': self._API_KEY, + 'format': 'json', + 'nojsoncallback': 1, } + if secret: + query['secret'] = secret + return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_info = self._call_api('photos.getInfo', video_id)['photo'] + if video_info['media'] == 'video': + streams = self._call_api('video.getStreamInfo', video_id, video_info['secret'])['streams'] + + preference = qualities(['iphone_wifi', '700', 'appletv', 'orig']) + + formats = [] + for stream in streams['stream']: + stream_type = str(stream.get('type')) + formats.append({ + 'format_id': stream_type, + 'url': stream['_content'], + 'preference': preference(stream_type), + }) + self._sort_formats(formats) + + owner = video_info.get('owner', {}) + + return { + 'id': video_id, + 'title': video_info['title']['_content'], + 'description': video_info.get('description', {}).get('_content'), + 'formats': formats, + 'timestamp': int_or_none(video_info.get('dateuploaded')), + 'duration': int_or_none(video_info.get('video', {}).get('duration')), + 'uploader_id': owner.get('nsid'), + 'uploader': owner.get('realname'), + 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + } From 146672254e409bf97c82a302095fbfabf2c48928 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 13:23:23 +0100 Subject: [PATCH 15/92] [flickr] extract fresh api key and remove duplication in test --- youtube_dl/extractor/flickr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 5ca754105..0d5d6b0b9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -17,7 +17,6 @@ class FlickrIE(InfoExtractor): 'id': '5645318632', 'ext': 'mpg', 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', - 'uploader_id': 'forestwander-nature-pictures', 'title': 'Dark Hollow Waterfalls', 'duration': 19, 'timestamp': 1303528740, @@ -29,26 +28,27 @@ class FlickrIE(InfoExtractor): } _API_BASE_URL = 'https://api.flickr.com/services/rest?' - _API_KEY = '61b16865f916058e63580a912d9143be' - def _call_api(self, method, video_id, secret=None): + def _call_api(self, method, video_id, api_key, note, secret=None): query = { 'photo_id': video_id, 'method': 'flickr.%s' % method, - 'api_key': self._API_KEY, + 'api_key': api_key, 'format': 'json', 'nojsoncallback': 1, } if secret: query['secret'] = secret - return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id) + return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._call_api('photos.getInfo', video_id)['photo'] + api_key = self._download_json('https://www.flickr.com/hermes_error_beacon.gne', video_id, 'Downloading api key',)['site_key'] + + video_info = self._call_api('photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] if video_info['media'] == 'video': - streams = self._call_api('video.getStreamInfo', video_id, video_info['secret'])['streams'] + streams = self._call_api('video.getStreamInfo', video_id, api_key, 'Downloading streams info', video_info['secret'])['streams'] preference = qualities(['iphone_wifi', '700', 'appletv', 'orig']) From f3003531a5622cc01501325b9f35dcb2424cfb70 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 13:38:11 +0100 Subject: [PATCH 16/92] [flickr] handle error message --- youtube_dl/extractor/flickr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0d5d6b0b9..e97754d36 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( + ExtractorError, int_or_none, qualities, ) @@ -39,7 +40,10 @@ class FlickrIE(InfoExtractor): } if secret: query['secret'] = secret - return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + if data['stat'] != 'ok': + raise ExtractorError(data['message']) + return data def _real_extract(self, url): video_id = self._match_id(url) From 967c9076a31ca2a2b43fb71082ad1a8db88116bd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 5 Nov 2015 18:01:13 +0100 Subject: [PATCH 17/92] raise ExtractorError if the page doesn't contain a video --- youtube_dl/extractor/flickr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index e97754d36..92d2ac553 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -79,3 +79,5 @@ class FlickrIE(InfoExtractor): 'uploader': owner.get('realname'), 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), } + else: + raise ExtractorError('not a video', expected=True) From a641b2459263228fb1dd86dfe05d6047cedbf345 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 07:23:03 +0100 Subject: [PATCH 18/92] [cnet] skip hls_phone if hls_tablet is present --- youtube_dl/extractor/cnet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 2fac0d79d..3ecf0efd4 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -57,7 +57,9 @@ class CNETIE(InfoExtractor): subtitles = {} description = vdata.get('description') - for vid in vdata['files'].values(): + for (fkey, vid) in vdata['files'].items(): + if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: + continue result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) formats.extend(result['formats']) subtitles = self._merge_subtitles(subtitles, result['subtitles']) From 3793090b1b1c1e3462b80dd3045a3573545cfb29 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 16:54:35 +0100 Subject: [PATCH 19/92] [amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors --- youtube_dl/extractor/amp.py | 84 ++++++++++++++++++++++++++++++ youtube_dl/extractor/dramafever.py | 65 ++++------------------- youtube_dl/extractor/foxnews.py | 64 ++++------------------- 3 files changed, 105 insertions(+), 108 deletions(-) create mode 100644 youtube_dl/extractor/amp.py diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..b573b9280 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AMPIE(InfoExtractor): + def _get_media_node(self, item, name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + item = self._download_json( + url, None, + 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed' + )['channel']['item'] + + video_id = item['guid'] + + thumbnails = [] + media_thumbnail = self._get_media_node(item, 'thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data['@attributes'] + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = self._get_media_node(item, 'subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data['@attributes'] + lang = subtitle.get('lang') or 'en' + subtitles[lang] = [{'url': subtitle['href']}] + + formats = [] + media_content = self._get_media_node(item, 'content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: + media = media_data['@attributes'] + media_type = media['type'] + if media_type == 'video/f4m': + f4m_formats = self._extract_f4m_formats(media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif media_type == 'application/x-mpegURL': + m3u8_formats = self._extract_m3u8_formats(media['url'], video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + formats.append({ + 'format_id': media_data['media-category']['@attributes']['label'], + 'url': media['url'], + 'preference': 1, + 'vbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._get_media_node(item, 'title'), + 'description': self._get_media_node(item, 'description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 38e6597c8..80a928827 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import itertools -from .common import InfoExtractor +from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -19,7 +19,7 @@ from ..utils import ( ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' @@ -80,60 +80,24 @@ class DramaFeverIE(DramaFeverBaseIE): 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: - feed = self._download_json( - 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, - video_id, 'Downloading episode JSON')['channel']['item'] + info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise - media_group = feed.get('media-group', {}) - - formats = [] - for media_content in media_group['media-content']: - src = media_content.get('@attributes', {}).get('url') - if not src: - continue - ext = determine_ext(src) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id='hds')) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls')) - else: - formats.append({ - 'url': src, - }) - self._sort_formats(formats) - - title = media_group.get('media-title') - description = media_group.get('media-description') - duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) - thumbnail = self._proto_relative_url( - media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) - timestamp = parse_iso8601(feed.get('pubDate'), ' ') - - subtitles = {} - for media_subtitle in media_group.get('media-subTitle', []): - lang = media_subtitle.get('@attributes', {}).get('lang') - href = media_subtitle.get('@attributes', {}).get('href') - if not lang or not href: - continue - subtitles[lang] = [{ - 'ext': 'ttml', - 'url': href, - }] - series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode @@ -146,21 +110,12 @@ class DramaFeverIE(DramaFeverBaseIE): if value: subfile = value[0].get('subfile') or value[0].get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': - subtitles.setdefault('English', []).append({ + info['subtitiles'].setdefault('English', []).append({ 'ext': 'srt', 'url': subfile, }) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + return info class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..0cd0f9fa8 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,14 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .amp import AMPIE from ..utils import ( parse_iso8601, int_or_none, ) -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ @@ -20,10 +20,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3937480', 'ext': 'flv', 'title': 'Frozen in Time', - 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', + 'description': '16-year-old girl is size of toddler', 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', + #'timestamp': 1304411491, + #'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -34,10 +34,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3922535568001', 'ext': 'mp4', 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses the president's executive action", + 'description': "Congressman discusses president's plan", 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', + #'timestamp': 1417662047, + #'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -56,48 +56,6 @@ class FoxNewsIE(InfoExtractor): video_id = mobj.group('id') host = mobj.group('host') - video = self._download_json( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - - item = video['channel']['item'] - title = item['title'] - description = item['description'] - timestamp = parse_iso8601(item['dc-date']) - - media_group = item['media-group'] - duration = None - formats = [] - for media in media_group['media-content']: - attributes = media['@attributes'] - video_url = attributes['url'] - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) - elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) - elif not video_url.endswith('.smil'): - duration = int_or_none(attributes.get('duration')) - formats.append({ - 'url': video_url, - 'format_id': media['media-category']['@attributes']['label'], - 'preference': 1, - 'vbr': int_or_none(attributes.get('bitrate')), - 'filesize': int_or_none(attributes.get('fileSize')) - }) - self._sort_formats(formats) - - media_thumbnail = media_group['media-thumbnail']['@attributes'] - thumbnails = [{ - 'url': media_thumbnail['url'], - 'width': int_or_none(media_thumbnail.get('width')), - 'height': int_or_none(media_thumbnail.get('height')), - }] if media_thumbnail else [] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + info = self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info From 63b728f06f00c2f1a45a67eddebd18bcdc36a753 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 16:56:21 +0100 Subject: [PATCH 20/92] [bleacherreport] Add new Extractor --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/bleacherreport.py | 121 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/bleacherreport.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..4d65ece94 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,10 @@ from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..a55e696d2 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + },{ + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'info_dict': { + 'id': '2586817', + 'ext': 'mp4', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:e95afafa43619816552723878b3b0a84', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + },{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + 'timestamp': 1434380212, + 'description': 'CFB, ACC, Florida State', + 'uploader_id': 3992341, + }, + 'add_ie': ['Vine'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type == 'cms.bleacherreport.com': + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'md5': 'f0ca220af012d4df857b54f792c586bb', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info['id'] = video_id + return info From 31b2051e211f3e2691a186d16733cf91eb4ab391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 14 Dec 2015 21:30:58 +0600 Subject: [PATCH 21/92] [utils] Add remove_quotes --- test/test_utils.py | 10 ++++++++++ youtube_dl/utils.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 501355c74..8fc74e591 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -43,6 +43,7 @@ from youtube_dl.utils import ( sanitize_path, prepend_extension, replace_extension, + remove_quotes, shell_quote, smuggle_url, str_to_int, @@ -200,6 +201,15 @@ class TestUtil(unittest.TestCase): self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') + def test_remove_quotes(self): + self.assertEqual(remove_quotes(None), None) + self.assertEqual(remove_quotes('"'), '"') + self.assertEqual(remove_quotes("'"), "'") + self.assertEqual(remove_quotes(';'), ';') + self.assertEqual(remove_quotes('";'), '";') + self.assertEqual(remove_quotes('""'), '') + self.assertEqual(remove_quotes('";"'), ';') + def test_ordered_set(self): self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7]) self.assertEqual(orderedSet([]), []) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d0606b4bc..91917fc96 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1406,6 +1406,15 @@ def remove_end(s, end): return s +def remove_quotes(s): + if s is None or len(s) < 2: + return s + for quote in ('"', "'", ): + if s[0] == quote and s[-1] == quote: + return s[1:-1] + return s + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip('/').split('/')[-1] From 0cb58b0259de0b0f44b0326d492b98a8eeb6316e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 14 Dec 2015 21:31:53 +0600 Subject: [PATCH 22/92] [youtube] Extract alt_title and creator for music videos (Closes #7862) --- youtube_dl/extractor/youtube.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9b39505ba..4556a16fb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,6 +33,7 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + remove_quotes, remove_start, sanitized_Request, smuggle_url, @@ -395,12 +396,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', + 'alt_title': 'I Love It (feat. Charli XCX)', 'description': 'md5:782e8651347686cba06e58f71ab51773', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'creator': 'Icona Pop', } }, { @@ -411,9 +414,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20130703', 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'creator': 'Justin Timberlake', 'age_limit': 18, } }, @@ -492,10 +497,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', + 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', + 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -551,9 +558,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20100430', 'uploader_id': 'deadmau5', + 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -701,10 +710,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', + 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader': 'IronSoulElf', + 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', }, 'params': { 'skip_download': True, @@ -1308,6 +1319,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + m_music = re.search( + r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', + video_webpage) + if m_music: + video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) + video_creator = clean_html(m_music.group('creator')) + else: + video_alt_title = video_creator = None + m_cat_container = self._search_regex( r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', video_webpage, 'categories', default=None) @@ -1537,7 +1557,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': upload_date, + 'creator': video_creator, 'title': video_title, + 'alt_title': video_alt_title, 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, From d7ffcfcf9703f1f02e642d0855c0977056e2d0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 15 Dec 2015 21:09:14 +0600 Subject: [PATCH 23/92] [tf1] Fix extraction (Closes #7873) --- youtube_dl/extractor/tf1.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 3a68eaa80..32f5f78bf 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -43,11 +43,9 @@ class TF1IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_regex( - r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url') - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed player page') - wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') + wat_id = self._html_search_regex( + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d+)\1', + webpage, 'wat id', group='id') wat_info = self._download_json( 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id) return self.url_result(wat_info['media']['url'], 'Wat') From ae5e94808e70e608e6bbdf71c0fb8436bcaa76e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 15 Dec 2015 21:11:52 +0600 Subject: [PATCH 24/92] [tf1] Fix extraction (2) --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 32f5f78bf..2d84c9bf8 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -44,7 +44,7 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d+)\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', webpage, 'wat id', group='id') wat_info = self._download_json( 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id) From 05467d5a5211adb5e2473decaef82769e778ffc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 15 Dec 2015 21:31:58 +0600 Subject: [PATCH 25/92] [tf1] Relax _VALID_URL --- youtube_dl/extractor/tf1.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 2d84c9bf8..80cd92938 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' + _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { @@ -38,6 +38,9 @@ class TF1IE(InfoExtractor): }, { 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): From ad1b6017cd48c62896a2d6c978f21e27f5d96d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 15 Dec 2015 21:36:59 +0600 Subject: [PATCH 26/92] [tf1] Fix tests --- youtube_dl/extractor/tf1.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 80cd92938..6890021cf 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -22,7 +22,7 @@ class TF1IE(InfoExtractor): }, { 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', 'info_dict': { - 'id': '12043945', + 'id': 'le-grand-mysterioso-chuggington-7085291-739', 'ext': 'mp4', 'title': 'Le grand Mystérioso - Chuggington', 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', @@ -32,6 +32,7 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + 'skip': 'HTTP Error 410: Gone', }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', 'only_matching': True, From 2d3b70271ca427cb4bbe3cf7bb8b6dc0b471cae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Dec 2015 04:44:17 +0600 Subject: [PATCH 27/92] [rutube] Extend _VALID_URL --- youtube_dl/extractor/rutube.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 6b09550b0..9db62adb1 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,9 +17,9 @@ from ..utils import ( class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/video/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', @@ -36,7 +36,10 @@ class RutubeIE(InfoExtractor): # It requires ffmpeg (m3u8 download) 'skip_download': True, }, - } + }, { + 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From eb4f27405b172e6ab19ed622a0296ec8d65abe0f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 16 Dec 2015 09:43:53 +0100 Subject: [PATCH 28/92] [vimeo] extract source file(closes #1072) --- youtube_dl/extractor/vimeo.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f392ccf1c..cf854a0f6 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -23,6 +23,7 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, unescapeHTML, + parse_filesize, ) @@ -392,6 +393,20 @@ class VimeoIE(VimeoBaseInfoExtractor): comment_count = None formats = [] + download_request = sanitized_Request('https://vimeo.com/%s?action=load_download_config' % video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest'}) + download_data = self._download_json(download_request, video_id, fatal=False) + if download_data: + source_file = download_data.get('source_file') + if source_file and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + formats.append({ + 'url': source_file['download_url'], + 'ext': source_file['extension'].lower(), + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_file.get('public_name', 'Original'), + }) config_files = config['video'].get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): video_url = f.get('url') From 8534bf1f0051640399b7c65c7d33e38b02f598fd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 16 Dec 2015 16:36:25 +0100 Subject: [PATCH 29/92] [vimeo] prefer original format --- youtube_dl/extractor/vimeo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cf854a0f6..715ede34e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -406,6 +406,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'height': int_or_none(source_file.get('height')), 'filesize': parse_filesize(source_file.get('size')), 'format_id': source_file.get('public_name', 'Original'), + 'preference': 1, }) config_files = config['video'].get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): @@ -423,12 +424,12 @@ class VimeoIE(VimeoBaseInfoExtractor): m3u8_url = config_files.get('hls', {}).get('url') if m3u8_url: m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False) + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. This lead to wrong sorting. - self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id')) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) subtitles = {} text_tracks = config['request'].get('text_tracks') From 323f82a7e05afcc3518706dc18016ebd97fbc052 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 16 Dec 2015 17:00:17 +0100 Subject: [PATCH 30/92] [vimeo] add test for original format --- youtube_dl/extractor/vimeo.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 715ede34e..ce08e6955 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -185,6 +185,20 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user28849593', }, }, + { + # contains original format + 'url': 'https://vimeo.com/33951933', + 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'info_dict': { + 'id': '33951933', + 'ext': 'mp4', + 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute', + 'uploader': 'The DMCI', + 'uploader_id': 'dmci', + 'upload_date': '20111220', + 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + }, + }, { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', From 35e22b6b32d0e662d486c9be8c76f6ea86f7cdd4 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 17 Dec 2015 12:51:50 +0100 Subject: [PATCH 31/92] [youku] check for the correct variable --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index d33caa79e..3a3432be8 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -221,7 +221,7 @@ class YoukuIE(InfoExtractor): 'Youku said: Sorry, this video is available in China only', expected=True) else: msg = 'Youku server reported error %i' % error.get('code') - if error is not None: + if error_note is not None: msg += ': ' + error_note raise ExtractorError(msg) From 8f0afda028e64f7620d8ce21e9cfa4d686e3e6c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Dec 2015 20:24:33 +0600 Subject: [PATCH 32/92] [pbs] Extend _VALID_URL (Closes #7889) --- youtube_dl/extractor/pbs.py | 320 ++++++++++++++++++------------------ 1 file changed, 160 insertions(+), 160 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 604a0dd22..744e4a09a 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -16,165 +16,165 @@ from ..utils import ( class PBSIE(InfoExtractor): _STATIONS = ( - ('video.pbs.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ - ('video.aptv.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ - ('video.gpb.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ - ('video.mpbonline.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org - ('video.wnpt.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org - ('video.wfsu.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ - ('video.wsre.org', 'WSRE (WSRE)'), # http://www.wsre.org - ('video.wtcitv.org', 'WTCI (WTCI)'), # http://www.wtcitv.org - ('video.pba.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ - ('video.alaskapublic.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm - # ('kuac.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ - # ('ktoo.org', '360 North (KTOO)'), # http://www.ktoo.org/ - # ('azpm.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ - ('video.azpbs.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org - ('portal.knme.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ - ('video.vegaspbs.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ - ('watch.aetn.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ - ('video.ket.org', 'KET (WKLE)'), # http://www.ket.org/ - ('video.wkno.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ - ('video.lpb.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ - ('videos.oeta.tv', 'OETA (KETA)'), # http://www.oeta.tv - ('video.optv.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ - ('watch.wsiu.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ - ('video.keet.org', 'KEET TV (KEET)'), # http://www.keet.org - ('pbs.kixe.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ - ('video.kpbs.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ - ('video.kqed.org', 'KQED (KQED)'), # http://www.kqed.org - ('vids.kvie.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org - ('video.pbssocal.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ - ('video.valleypbs.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ - ('video.cptv.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org - ('watch.knpb.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ - ('video.soptv.org', 'SOPTV (KSYS)'), # http://www.soptv.org - # ('klcs.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org - # ('krcb.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org - # ('kvcr.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org - ('video.rmpbs.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org - ('video.kenw.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org - ('video.kued.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org - ('video.wyomingpbs.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org - ('video.cpt12.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ - ('video.kbyueleven.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ - ('video.thirteen.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org - ('video.wgbh.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org - ('video.wgby.org', 'WGBY (WGBY)'), # http://www.wgby.org - ('watch.njtvonline.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ - # ('ripbs.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ - ('watch.wliw.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ - ('video.mpt.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org - ('watch.weta.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org - ('video.whyy.org', 'WHYY (WHYY)'), # http://www.whyy.org - ('video.wlvt.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ - ('video.wvpt.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net - ('video.whut.org', 'Howard University Television (WHUT)'), # http://www.whut.org - ('video.wedu.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org - ('video.wgcu.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ - # ('wjct.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org - ('video.wpbt2.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org - ('video.wucftv.org', 'WUCF TV (WUCF)'), # http://wucftv.org - ('video.wuft.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org - ('watch.wxel.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ - ('video.wlrn.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ - ('video.wusf.usf.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ - ('video.scetv.org', 'ETV (WRLK)'), # http://www.scetv.org - ('video.unctv.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ - # ('pbsguam.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ - ('video.pbshawaii.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ - ('video.idahoptv.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org - ('video.ksps.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ - ('watch.opb.org', 'OPB (KOPB)'), # http://www.opb.org - ('watch.nwptv.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org - ('video.will.illinois.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ - ('video.networkknowledge.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv - ('video.wttw.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ - # ('wtvp.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ - ('video.iptv.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ - ('video.ninenet.org', 'Nine Network (KETC)'), # http://www.ninenet.org - ('video.wfwa.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ - ('video.wfyi.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org - ('video.mptv.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org - ('video.wnin.org', 'WNIN (WNIN)'), # http://www.wnin.org/ - ('video.wnit.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ - ('video.wpt.org', 'WPT (WPNE)'), # http://www.wpt.org/ - ('video.wvut.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ - ('video.weiu.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net - ('video.wqpt.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org - ('video.wycc.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org - # ('lakeshorepublicmedia.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ - ('video.wipb.org', 'WIPB-TV (WIPB)'), # http://wipb.org - ('video.indianapublicmedia.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ - ('watch.cetconnect.org', 'CET (WCET)'), # http://www.cetconnect.org - ('video.thinktv.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org - ('video.wbgu.org', 'WBGU-TV (WBGU)'), # http://wbgu.org - ('video.wgvu.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ - ('video.netnebraska.org', 'NET1 (KUON)'), # http://netnebraska.org - ('video.pioneer.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org - ('watch.sdpb.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org - ('video.tpt.org', 'TPT (KTCA)'), # http://www.tpt.org - ('watch.ksmq.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ - ('watch.kpts.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ - ('watch.ktwu.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org - # ('shptv.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org - # ('kcpt.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ - # ('blueridgepbs.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ - ('watch.easttennesseepbs.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org - ('video.wcte.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org - ('video.wljt.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ - ('video.wosu.org', 'WOSU TV (WOSU)'), # http://wosu.org/ - ('video.woub.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 - ('video.wvpublic.org', 'WVPB (WVPB)'), # http://wvpublic.org/ - ('video.wkyupbs.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org - # ('wyes.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org - ('video.kera.org', 'KERA 13 (KERA)'), # http://www.kera.org/ - ('video.mpbn.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ - ('video.mountainlake.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ - ('video.nhptv.org', 'NHPTV (WENH)'), # http://nhptv.org/ - ('video.vpt.org', 'Vermont PBS (WETK)'), # http://www.vpt.org - ('video.witf.org', 'witf (WITF)'), # http://www.witf.org - ('watch.wqed.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ - ('video.wmht.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ - ('video.deltabroadcasting.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org - ('video.dptv.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ - ('video.wcmu.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org - ('video.wkar.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ - ('wnmuvideo.nmu.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu - ('video.wdse.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ - ('video.wgte.org', 'WGTE TV (WGTE)'), # http://www.wgte.org - ('video.lptv.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org - # ('prairiepublic.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ - ('video.kmos.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ - ('watch.montanapbs.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org - ('video.krwg.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org - ('video.kacvtv.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ - ('video.kcostv.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org - ('video.wcny.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org - ('video.wned.org', 'WNED (WNED)'), # http://www.wned.org/ - ('watch.wpbstv.org', 'WPBS (WPBS)'), # http://www.wpbstv.org - ('video.wskg.org', 'WSKG Public TV (WSKG)'), # http://wskg.org - ('video.wxxi.org', 'WXXI (WXXI)'), # http://wxxi.org - ('video.wpsu.org', 'WPSU (WPSU)'), # http://www.wpsu.org - # ('wqln.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org - ('on-demand.wvia.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ - ('video.wtvi.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ - # ('whro.org', 'WHRO (WHRO)'), # http://whro.org - ('video.westernreservepublicmedia.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ - ('video.ideastream.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ - ('video.kcts9.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ - ('video.basinpbs.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org - ('video.houstonpbs.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ - # ('tamu.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu - # ('kedt.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org - ('video.klrn.org', 'KLRN (KLRN)'), # http://www.klrn.org - ('video.klru.tv', 'KLRU (KLRU)'), # http://www.klru.org - # ('kmbh.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org - # ('knct.org', 'KNCT (KNCT)'), # http://www.knct.org - # ('ktxt.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org - ('video.wtjx.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ - ('video.ideastations.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ - ('video.kbtc.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org + (r'(?:video|www)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ + (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ + (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ + (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org + (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org + (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/ + (r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org + (r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org + (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/ + (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm + # (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/ + # (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/ + # (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/ + (r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org + (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/ + (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/ + (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/ + (r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/ + (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/ + (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/ + (r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv + (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/ + (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/ + (r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org + (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/ + (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/ + (r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org + (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org + (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/ + (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/ + (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org + (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/ + (r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org + # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org + # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org + # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org + (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org + (r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org + (r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org + (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org + (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/ + (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/ + (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org + (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org + (r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org + (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/ + # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/ + (r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/ + (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org + (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org + (r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org + (r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/ + (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net + (r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org + (r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org + (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/ + # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org + (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org + (r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org + (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org + (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/ + (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/ + (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/ + (r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org + (r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/ + # (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/ + (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/ + (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org + (r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/ + (r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org + (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org + (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/ + (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv + (r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/ + # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/ + (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/ + (r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org + (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/ + (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org + (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org + (r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/ + (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/ + (r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/ + (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/ + (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net + (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org + (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org + # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/ + (r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org + (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/ + (r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org + (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org + (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org + (r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/ + (r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org + (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org + (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org + (r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org + (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/ + (r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/ + (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org + # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org + # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/ + # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/ + (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org + (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org + (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/ + (r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/ + (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5 + (r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/ + (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org + # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org + (r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/ + (r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/ + (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/ + (r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/ + (r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org + (r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org + (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/ + (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/ + (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org + (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/ + (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org + (r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/ + (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu + (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/ + (r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org + (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org + # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/ + (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/ + (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org + (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org + (r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/ + (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org + (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org + (r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/ + (r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org + (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org + (r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org + (r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org + # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org + (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/ + (r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/ + # (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org + (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/ + (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/ + (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/ + (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org + (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/ + # (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu + # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org + (r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org + (r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org + # (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org + # (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org + # (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org + (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/ + (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/ + (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org ) IE_NAME = 'pbs' @@ -189,7 +189,7 @@ class PBSIE(InfoExtractor): # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) - ''' % '|'.join(re.escape(p) for p in list(zip(*_STATIONS))[0]) + ''' % '|'.join(list(zip(*_STATIONS))[0]) _TESTS = [ { From 2469a6aecbc819b87915d31dd2fb5940d84d11d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Dec 2015 22:16:22 +0600 Subject: [PATCH 33/92] [noco] Adjust timestamp according to server time (Closes #7864) --- youtube_dl/extractor/noco.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 76bd21e6d..63b97f170 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse, + compat_urlparse, ) from ..utils import ( clean_html, @@ -82,14 +83,21 @@ class NocoIE(InfoExtractor): if 'erreur' in login: raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + @staticmethod + def _ts(): + return int(time.time() * 1000) + def _call_api(self, path, video_id, note, sub_lang=None): - ts = compat_str(int(time.time() * 1000)) + ts = compat_str(self._ts() + self._ts_offset) tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() url = self._API_URL_TEMPLATE % (path, ts, tk) if sub_lang: url += self._SUB_LANG_TEMPLATE % sub_lang - resp = self._download_json(url, video_id, note) + request = sanitized_Request(url) + request.add_header('Referer', self._referer) + + resp = self._download_json(request, video_id, note) if isinstance(resp, dict) and resp.get('error'): self._raise_error(resp['error'], resp['description']) @@ -105,6 +113,21 @@ class NocoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + # Timestamp adjustment offset between server time and local time + # must be calculated in order to use timestamps closest to server's + # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) + webpage = self._download_webpage(url, video_id) + + player_url = self._search_regex( + r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', + webpage, 'noco player', group='player', + default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) + ts = int_or_none(qs.get('ts', [None])[0]) + self._ts_offset = ts - self._ts() if ts else 0 + self._referer = player_url + medias = self._call_api( 'shows/%s/medias' % video_id, video_id, 'Downloading video JSON') From 7824e1f6a61e97c3c79c21db65de33a6380e29c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Dec 2015 22:16:58 +0600 Subject: [PATCH 34/92] [noco] Modernize --- youtube_dl/extractor/noco.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 63b97f170..daba2e12f 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -110,8 +110,7 @@ class NocoIE(InfoExtractor): expected=True) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # Timestamp adjustment offset between server time and local time # must be calculated in order to use timestamps closest to server's From 9dc1d94a0ccf39c4dab686cbd5941d75208248fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Dec 2015 22:18:28 +0600 Subject: [PATCH 35/92] [noco] Fix bitrates --- youtube_dl/extractor/noco.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index daba2e12f..d440313d5 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -177,8 +177,8 @@ class NocoIE(InfoExtractor): 'format_id': format_id_extended, 'width': int_or_none(fmt.get('res_width')), 'height': int_or_none(fmt.get('res_lines')), - 'abr': int_or_none(fmt.get('audiobitrate')), - 'vbr': int_or_none(fmt.get('videobitrate')), + 'abr': int_or_none(fmt.get('audiobitrate'), 1000), + 'vbr': int_or_none(fmt.get('videobitrate'), 1000), 'filesize': int_or_none(fmt.get('filesize')), 'format_note': qualities[format_id].get('quality_name'), 'quality': qualities[format_id].get('priority'), From 45dad8bab98e30eb3fb1cf0f20770c1dce74610b Mon Sep 17 00:00:00 2001 From: Gautam M <Blue9@users.noreply.github.com> Date: Fri, 18 Dec 2015 03:16:36 -0500 Subject: [PATCH 36/92] Fix hyperlink to youtube-dl options --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c3915d64..7002f45e0 100644 --- a/README.md +++ b/README.md @@ -757,7 +757,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L117-L265). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L121-L269). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: From 10171468d94e68f92505a77938be11d266ccabd8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 18 Dec 2015 18:20:41 +0800 Subject: [PATCH 37/92] [iqiyi] Update key (closes #7896) --- youtube_dl/extractor/iqiyi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index f96e12e69..c3731a110 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -205,8 +205,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-12-06 for Zombie::bite - enc_key = '3719f6a1da83ee0aee3488d8802d7696'[::-1] + # last update at 2015-12-18 for Zombie::bite + enc_key = '8b6b683780897eb8d9a48a02ccc4817d'[::-1] return enc_key def _real_extract(self, url): From b95779be21929c1aed1c0edbc3f9a5d11826973f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 18:57:49 +0600 Subject: [PATCH 38/92] [jsinterp] Extend function regex (Closes #7900, closes #7901) --- youtube_dl/jsinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 2191e8b89..a7440c582 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object): def extract_function(self, funcname): func_m = re.search( r'''(?x) - (?:function\s+%s|[{;]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* \{(?P<code>[^}]+)\}''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), From 016dd820505e635ed1d4e5890f370cf800b25c7f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Dec 2015 14:21:30 +0100 Subject: [PATCH 39/92] release 2015.12.18 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 20b44b94d..01607693e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.13' +__version__ = '2015.12.18' From 9796a9b20ccc46217c27523774f5088d74e672bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 21:34:17 +0600 Subject: [PATCH 40/92] [ndr] Fix description and upload date extraction (Closes #7893) --- youtube_dl/extractor/ndr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 16213eed9..894c51399 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -88,10 +88,10 @@ class NDRIE(NDRBaseIE): 'embedURL', webpage, 'embed URL', fatal=True) description = self._search_regex( r'<p[^>]+itemprop="description">([^<]+)</p>', - webpage, 'description', fatal=False) + webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r'<span itemprop="datePublished" content="([^"]+)">', + r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', webpage, 'upload date', fatal=False)) return { '_type': 'url_transparent', From 7234d1d9c795bd43799de47952f65f72952e0564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:05:32 +0600 Subject: [PATCH 41/92] [brightcove:new] Add _extract_url --- youtube_dl/extractor/brightcove.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f5ebae1e6..66b8d2dff 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -389,6 +389,11 @@ class BrightcoveNewIE(InfoExtractor): } }] + @staticmethod + def _extract_url(webpage): + urls = BrightcoveNewIE._extract_urls(webpage) + return urls[0] if urls else None + @staticmethod def _extract_urls(webpage): # Reference: From 15d50aca64421c97d251bcd86e9bb308265dfdaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:05:56 +0600 Subject: [PATCH 42/92] [nowness] Add support for brightcove:new videos (Closes #7884) --- youtube_dl/extractor/nowness.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index d480fb58c..446f5901c 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from .brightcove import BrightcoveLegacyIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -23,9 +26,12 @@ class NownessBaseIE(InfoExtractor): note='Downloading player JavaScript', errnote='Unable to download player JavaScript') bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) - if bc_url is None: - raise ExtractorError('Could not find player definition') - return self.url_result(bc_url, 'BrightcoveLegacy') + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + bc_url = BrightcoveNewIE._extract_url(player_code) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) + raise ExtractorError('Could not find player definition') elif source == 'vimeo': return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') elif source == 'youtube': From 9fd0f676788f94768b80fe5ff87895b607c84345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:18:55 +0600 Subject: [PATCH 43/92] [brightcove:new] Add support for ref: preffixed video ids (Closes #7794) --- youtube_dl/extractor/brightcove.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 66b8d2dff..d77590195 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -355,7 +355,7 @@ class BrightcoveLegacyIE(InfoExtractor): class BrightcoveNewIE(InfoExtractor): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -414,7 +414,7 @@ class BrightcoveNewIE(InfoExtractor): # may be optional and what to do when it is r'''(?sx) <video[^>]+ - data-video-id=["\'](\d+)["\'][^>]*>.*? + data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? </video>.*? <script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ From f81ccbb3df205c612275f8881ddf21115eb590b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:20:44 +0600 Subject: [PATCH 44/92] [brightcove:new] Fix typo --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d77590195..b0536ddbd 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -398,7 +398,7 @@ class BrightcoveNewIE(InfoExtractor): def _extract_urls(webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript) + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html entries = [] From 5b72fda140488685be55215d584f764c39577b02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:22:41 +0600 Subject: [PATCH 45/92] [brightcove:new] Clarify ref: prefix --- youtube_dl/extractor/brightcove.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b0536ddbd..4e983fa10 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -400,6 +400,7 @@ class BrightcoveNewIE(InfoExtractor): # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html + # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] @@ -412,6 +413,7 @@ class BrightcoveNewIE(InfoExtractor): for video_id, account_id, player_id, embed in re.findall( # According to examples from [3] it's unclear whether video id # may be optional and what to do when it is + # According to [4] data-video-id may be prefixed with ref: r'''(?sx) <video[^>]+ data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? From 4f29fa99069760dc47ef9ca5dbf607a567d2982f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Dec 2015 22:31:48 +0600 Subject: [PATCH 46/92] [brightcove:new] Add test for ref: prefixed video id --- youtube_dl/extractor/brightcove.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4e983fa10..03a4f446e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -387,6 +387,10 @@ class BrightcoveNewIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + # ref: prefixed video id + 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', + 'only_matching': True, }] @staticmethod From d631d5f9f27f93767226192e4288990413fa9dbd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 19 Dec 2015 18:21:42 +0800 Subject: [PATCH 47/92] [utils] Fix TTML conversion Tolerate invalid timestamps (closes #7909) --- test/test_utils.py | 7 +++++-- youtube_dl/utils.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8fc74e591..86045e680 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -661,8 +661,8 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') {'like_count': 190, 'dislike_count': 10})) def test_parse_dfxp_time_expr(self): - self.assertEqual(parse_dfxp_time_expr(None), 0.0) - self.assertEqual(parse_dfxp_time_expr(''), 0.0) + self.assertEqual(parse_dfxp_time_expr(None), None) + self.assertEqual(parse_dfxp_time_expr(''), None) self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1) self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1) self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0) @@ -676,6 +676,9 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') <p begin="0" end="1">The following line contains Chinese characters and special symbols</p> <p begin="1" end="2">第二行<br/>♪♪</p> <p begin="2" dur="1"><span>Third<br/>Line</span></p> + <p begin="3" end="-1">Lines with invalid timestamps are ignored</p> + <p begin="-1" end="-1">Ignore, two</p> + <p begin="3" dur="-1">Ignored, three</p> </div> </body> </tt>''' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 91917fc96..ee20c3d9b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1976,7 +1976,7 @@ def match_filter_func(filter_str): def parse_dfxp_time_expr(time_expr): if not time_expr: - return 0.0 + return mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) if mobj: @@ -2020,10 +2020,15 @@ def dfxp2srt(dfxp_data): raise ValueError('Invalid dfxp/TTML subtitle') for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib['begin']) + begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) + dur = parse_dfxp_time_expr(para.attrib.get('dur')) + if begin_time is None: + continue if not end_time: - end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) + if not dur: + continue + end_time = begin_time + dur out.append('%d\n%s --> %s\n%s\n\n' % ( index, srt_subtitles_timecode(begin_time), From db2fe38b5508cbd28b89893219d9cccd41406851 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 19 Dec 2015 19:29:51 +0800 Subject: [PATCH 48/92] [utils] Support alternative timestamp format in TTML Fixes #7608 --- test/test_utils.py | 1 + youtube_dl/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 86045e680..e4e8d3825 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -667,6 +667,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1) self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0) self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1) + self.assertEqual(parse_dfxp_time_expr('00:00:01:100'), 1.1) def test_dfxp2srt(self): dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ee20c3d9b..5b396ede8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1982,9 +1982,9 @@ def parse_dfxp_time_expr(time_expr): if mobj: return float(mobj.group('time_offset')) - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) + mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) + return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) def srt_subtitles_timecode(seconds): From ee0f0393cfd3daeff729f5f5594e4c5b1bec5436 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Thu, 17 Sep 2015 13:51:50 +0800 Subject: [PATCH 49/92] [togglesg] New extractor for toggle.sg --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/togglesg.py | 159 +++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 youtube_dl/extractor/togglesg.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cd95ba01..b88fbcc4d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -675,6 +675,7 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) +from .togglesg import ToggleSgIE from .thvideo import ( THVideoIE, THVideoPlaylistIE diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py new file mode 100644 index 000000000..56ef4b464 --- /dev/null +++ b/youtube_dl/extractor/togglesg.py @@ -0,0 +1,159 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re +import itertools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + determine_ext, + parse_iso8601, + remove_end +) +from ..compat import compat_urllib_request + + +class ToggleSgIE(InfoExtractor): + IE_NAME = 'togglesg' + _VALID_URL = r'https?://video\.toggle\.sg/(?:(en|zh))/(?:(series|clips|movies))/.+?/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', + 'info_dict': { + 'id': '343115', + 'ext': 'mp4', + 'title': 'Lion Moms Premiere', + 'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b', + 'upload_date': '20150910', + 'timestamp': 1441858274, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + }, { + 'note': 'DRM-protected video', + 'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413', + 'info_dict': { + 'id': '341413', + 'ext': 'wvm', + 'title': 'Dug\'s Special Mission', + 'description': 'md5:e86c6f4458214905c1772398fabc93e0', + 'upload_date': '20150827', + 'timestamp': 1440644006, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + } + }, { + 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', + 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/ep11/332861', + 'info_dict': { + 'id': '332861', + 'ext': 'mp4', + 'title': '28th SEA Games (5 Show) - Episode 11', + 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa', + 'upload_date': '20150605', + 'timestamp': 1433480166, + }, + 'params': { + 'skip_download': 'DRM-protected wvm download', + }, + 'skip': 'm3u8 links are geo-restricted' + }, { + 'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', + 'only_matching': True, + }] + + _FORMAT_PREFERENCES = { + 'wvm-STBMain': -10, + 'wvm-iPadMain': -20, + 'wvm-iPhoneMain': -30, + 'wvm-Android': -40, + } + _API_USER = 'tvpapi_147' + _API_PASS = '11111' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, note='Downloading video page') + + api_user = self._search_regex( + r'apiUser:\s*"([^"]+)"', webpage, 'apiUser', default=self._API_USER, fatal=False) + api_pass = self._search_regex( + r'apiPass:\s*"([^"]+)"', webpage, 'apiPass', default=self._API_PASS, fatal=False) + + params = { + 'initObj': { + 'Locale': { + 'LocaleLanguage': '', 'LocaleCountry': '', + 'LocaleDevice': '', 'LocaleUserState': 0 + }, + 'Platform': 0, 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', + 'ApiUser': api_user, 'ApiPass': api_pass + }, + 'MediaID': video_id, + 'mediaType': 0, + } + + req = compat_urllib_request.Request( + 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', + json.dumps(params).encode('utf-8')) + info = self._download_json(req, video_id, 'Downloading video info json') + + title = info['MediaName'] + duration = int_or_none(info.get('Duration')) + thumbnail = info.get('PicURL') + description = info.get('Description') + created_at = parse_iso8601(info.get('CreationDate') or None) + formats = [] + + for video_file in info.get('Files', []): + ext = determine_ext(video_file['URL']) + vid_format = video_file['Format'].replace(' ', '') + # if geo-restricted, m3u8 is inaccessible, but mp4 is okay + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_file['URL'], video_id, ext='mp4', m3u8_id=vid_format, + note='Downloading %s m3u8 information' % vid_format, + errnote='Failed to download %s m3u8 information' % vid_format, + fatal=False + ) + if m3u8_formats: + formats.extend(m3u8_formats) + if ext in ['mp4', 'wvm']: + # wvm are drm-protected files + formats.append({ + 'ext': ext, + 'url': video_file['URL'], + 'format_id': vid_format, + 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, + 'format_note': 'DRM-protected video' if ext == 'wvm' else None + }) + + if not formats: + # Most likely because geo-blocked + raise ExtractorError('No downloadable videos found', expected=True) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': created_at, + 'thumbnail': thumbnail, + 'formats': formats, + } From ed370ff0e6f52aaf3647b32cdbba9d25ce8533e5 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Fri, 18 Sep 2015 00:51:41 +0800 Subject: [PATCH 50/92] [togglesg] Fixes --- youtube_dl/extractor/togglesg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 56ef4b464..9f958d453 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -18,7 +18,7 @@ from ..compat import compat_urllib_request class ToggleSgIE(InfoExtractor): IE_NAME = 'togglesg' - _VALID_URL = r'https?://video\.toggle\.sg/(?:(en|zh))/(?:(series|clips|movies))/.+?/(?P<id>[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/.+?/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -90,9 +90,9 @@ class ToggleSgIE(InfoExtractor): webpage = self._download_webpage(url, video_id, note='Downloading video page') api_user = self._search_regex( - r'apiUser:\s*"([^"]+)"', webpage, 'apiUser', default=self._API_USER, fatal=False) + r'apiUser:\s*"([^"]+)"', webpage, 'apiUser', default=self._API_USER) api_pass = self._search_regex( - r'apiPass:\s*"([^"]+)"', webpage, 'apiPass', default=self._API_PASS, fatal=False) + r'apiPass:\s*"([^"]+)"', webpage, 'apiPass', default=self._API_PASS) params = { 'initObj': { From f8253af561655c7c470323e631e8b66283bcf623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:03:55 +0600 Subject: [PATCH 51/92] [toggle] Use sanitized_Request --- youtube_dl/extractor/togglesg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 9f958d453..05a9e5439 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -11,7 +11,8 @@ from ..utils import ( int_or_none, determine_ext, parse_iso8601, - remove_end + remove_end, + sanitized_Request, ) from ..compat import compat_urllib_request @@ -107,7 +108,7 @@ class ToggleSgIE(InfoExtractor): 'mediaType': 0, } - req = compat_urllib_request.Request( + req = sanitized_Request( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', json.dumps(params).encode('utf-8')) info = self._download_json(req, video_id, 'Downloading video info json') From c82a8dd14c813e1b022188925fc33fb4d831bc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:04:38 +0600 Subject: [PATCH 52/92] [toggle] Remove unused imports --- youtube_dl/extractor/togglesg.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 05a9e5439..ea660960e 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -2,19 +2,15 @@ from __future__ import unicode_literals import json -import re -import itertools from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, - determine_ext, parse_iso8601, - remove_end, sanitized_Request, ) -from ..compat import compat_urllib_request class ToggleSgIE(InfoExtractor): From 74c730174fa872ae37b4709a3a452391ee6273fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:06:05 +0600 Subject: [PATCH 53/92] [toggle] Style --- youtube_dl/extractor/togglesg.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index ea660960e..d3903b58d 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -94,11 +94,17 @@ class ToggleSgIE(InfoExtractor): params = { 'initObj': { 'Locale': { - 'LocaleLanguage': '', 'LocaleCountry': '', - 'LocaleDevice': '', 'LocaleUserState': 0 + 'LocaleLanguage': '', + 'LocaleCountry': '', + 'LocaleDevice': '', + 'LocaleUserState': 0 }, - 'Platform': 0, 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, 'ApiPass': api_pass + 'Platform': 0, + 'SiteGuid': 0, + 'DomainID': '0', + 'UDID': '', + 'ApiUser': api_user, + 'ApiPass': api_pass }, 'MediaID': video_id, 'mediaType': 0, From ffaf6e66e3830d0e4750aec2cfdaff1a6bd9c2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:08:47 +0600 Subject: [PATCH 54/92] [toggle] Improve --- youtube_dl/extractor/togglesg.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index d3903b58d..244c79e8d 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -84,12 +84,15 @@ class ToggleSgIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, note='Downloading video page') + webpage = self._download_webpage( + url, video_id, note='Downloading video page') api_user = self._search_regex( - r'apiUser:\s*"([^"]+)"', webpage, 'apiUser', default=self._API_USER) + r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', + default=self._API_USER, group='user') api_pass = self._search_regex( - r'apiPass:\s*"([^"]+)"', webpage, 'apiPass', default=self._API_PASS) + r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', + default=self._API_PASS, group='pass') params = { 'initObj': { @@ -131,11 +134,10 @@ class ToggleSgIE(InfoExtractor): video_file['URL'], video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False - ) + fatal=False) if m3u8_formats: formats.extend(m3u8_formats) - if ext in ['mp4', 'wvm']: + elif ext in ('mp4', 'wvm'): # wvm are drm-protected files formats.append({ 'ext': ext, From c40dbb19ab475e8a0e1b29548130adf9ce13ea43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:19:26 +0600 Subject: [PATCH 55/92] [toggle] Extract thumbnails --- youtube_dl/extractor/togglesg.py | 34 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 244c79e8d..a2b89d6bb 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -119,12 +120,8 @@ class ToggleSgIE(InfoExtractor): info = self._download_json(req, video_id, 'Downloading video info json') title = info['MediaName'] - duration = int_or_none(info.get('Duration')) - thumbnail = info.get('PicURL') - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - formats = [] + formats = [] for video_file in info.get('Files', []): ext = determine_ext(video_file['URL']) vid_format = video_file['Format'].replace(' ', '') @@ -146,19 +143,40 @@ class ToggleSgIE(InfoExtractor): 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) - if not formats: # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) - self._sort_formats(formats) + duration = int_or_none(info.get('Duration')) + description = info.get('Description') + created_at = parse_iso8601(info.get('CreationDate') or None) + + thumbnails = [] + for picture in info.get('Pictures', []): + if not isinstance(picture, dict): + continue + pic_url = picture.get('URL') + if not pic_url: + continue + thumbnail = { + 'url': pic_url, + } + pic_size = picture.get('PicSize', '') + m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size) + if m: + thumbnail.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + thumbnails.append(thumbnail) + return { 'id': video_id, 'title': title, 'description': description, 'duration': duration, 'timestamp': created_at, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'formats': formats, } From 8f097af4ec5d6ff72a80c744d67f947336fecb28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:23:28 +0600 Subject: [PATCH 56/92] [toggle] Extract counters --- youtube_dl/extractor/togglesg.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index a2b89d6bb..36367971f 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, + float_or_none, int_or_none, parse_iso8601, sanitized_Request, @@ -152,6 +153,10 @@ class ToggleSgIE(InfoExtractor): description = info.get('Description') created_at = parse_iso8601(info.get('CreationDate') or None) + average_rating = float_or_none(info.get('Rating')) + view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) + like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) + thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -177,6 +182,9 @@ class ToggleSgIE(InfoExtractor): 'description': description, 'duration': duration, 'timestamp': created_at, + 'average_rating': average_rating, + 'view_count': view_count, + 'like_count': like_count, 'thumbnails': thumbnails, 'formats': formats, } From 989e9f8eadc936ab47fcd0b7ba63aac13020a526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:52:37 +0600 Subject: [PATCH 57/92] [toggle] Improve formats extraction robustness --- youtube_dl/extractor/togglesg.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 36367971f..47b2bfcb8 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -124,12 +124,15 @@ class ToggleSgIE(InfoExtractor): formats = [] for video_file in info.get('Files', []): - ext = determine_ext(video_file['URL']) - vid_format = video_file['Format'].replace(' ', '') + video_url, vid_format = video_file.get('URL'), video_file.get('Format') + if not video_url or not vid_format: + continue + ext = determine_ext(video_url) + vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( - video_file['URL'], video_id, ext='mp4', m3u8_id=vid_format, + video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, fatal=False) @@ -139,7 +142,7 @@ class ToggleSgIE(InfoExtractor): # wvm are drm-protected files formats.append({ 'ext': ext, - 'url': video_file['URL'], + 'url': video_url, 'format_id': vid_format, 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, 'format_note': 'DRM-protected video' if ext == 'wvm' else None From e33c9cba7c6c9d4a04d749c88ab447a57bcfe423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:58:18 +0600 Subject: [PATCH 58/92] [toggle] Improve _VALID_URL --- youtube_dl/extractor/togglesg.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/togglesg.py index 47b2bfcb8..22c6c91b9 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/togglesg.py @@ -17,7 +17,7 @@ from ..utils import ( class ToggleSgIE(InfoExtractor): IE_NAME = 'togglesg' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/.+?/(?P<id>[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -46,8 +46,9 @@ class ToggleSgIE(InfoExtractor): 'skip_download': 'DRM-protected wvm download', } }, { + # this also tests correct video id extraction 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay', - 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/ep11/332861', + 'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861', 'info_dict': { 'id': '332861', 'ext': 'mp4', From cc0f378d5429225caf0477f5d10b342b1ef136d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 19:59:00 +0600 Subject: [PATCH 59/92] [toggle] Rename to toggle --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{togglesg.py => toggle.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename youtube_dl/extractor/{togglesg.py => toggle.py} (99%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b88fbcc4d..fef2cff62 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -675,7 +675,7 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .togglesg import ToggleSgIE +from .toggle import ToggleIE from .thvideo import ( THVideoIE, THVideoPlaylistIE diff --git a/youtube_dl/extractor/togglesg.py b/youtube_dl/extractor/toggle.py similarity index 99% rename from youtube_dl/extractor/togglesg.py rename to youtube_dl/extractor/toggle.py index 22c6c91b9..c9d6e52e1 100644 --- a/youtube_dl/extractor/togglesg.py +++ b/youtube_dl/extractor/toggle.py @@ -15,7 +15,7 @@ from ..utils import ( ) -class ToggleSgIE(InfoExtractor): +class ToggleIE(InfoExtractor): IE_NAME = 'togglesg' _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' _TESTS = [{ From 0f206ee81447349f9ec8c430de4868a86020564c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Dec 2015 23:11:23 +0600 Subject: [PATCH 60/92] [toggle] Change IE_NAME --- youtube_dl/extractor/toggle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index c9d6e52e1..a47239952 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -16,7 +16,7 @@ from ..utils import ( class ToggleIE(InfoExtractor): - IE_NAME = 'togglesg' + IE_NAME = 'toggle' _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', From e0f06eae432e592b159237f2ce9813449431cc7b Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 19 Dec 2015 18:26:28 +0100 Subject: [PATCH 61/92] [fktv] fix info extraction --- youtube_dl/extractor/fktv.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 40ea27895..5f6e65dae 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - ExtractorError, + js_to_json, ) @@ -32,24 +30,22 @@ class FKTVIE(InfoExtractor): 'http://fernsehkritik.tv/folge-%s/play' % episode, episode) title = clean_html(self._html_search_regex( '<h3>([^<]+)</h3>', webpage, 'title')) - matches = re.search( - r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>', - webpage) - if matches is None: - raise ExtractorError('Unable to extract the video') + thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) + sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) - poster, sources = matches.groups() - if poster is None: - self.report_warning('unable to extract thumbnail') + formats = [] + for source in sources: + furl = source.get('src') + if furl: + formats.append({ + 'url': furl, + 'format_id': determine_ext(furl), + }) + self._sort_formats(formats) - urls = re.findall(r'<source[^>]+src="([^"]+)"', sources) - formats = [{ - 'url': furl, - 'format_id': determine_ext(furl), - } for furl in urls] return { 'id': episode, 'title': title, 'formats': formats, - 'thumbnail': poster, + 'thumbnail': thumbnail, } From fa64a8431166ad7ca7bd2881306ee379b6e2abfd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 19 Dec 2015 19:02:04 +0100 Subject: [PATCH 62/92] [faz] fix info extraction --- youtube_dl/extractor/faz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index cebdd0193..d9a868119 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -38,7 +38,7 @@ class FazIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config_xml_url = self._search_regex( - r'writeFLV\(\'(.+?)\',', webpage, 'config xml url') + r'(?:var\s+)?videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') config = self._download_xml( config_xml_url, video_id, 'Downloading config xml') From dd85e4d70787dd0c106fb05bdb0a38f51037fd39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 02:43:50 +0600 Subject: [PATCH 63/92] [extractor/common] Properly decode error string on python 2 (Closes #1354, closes #3957, closes #4037, closes #6449) --- youtube_dl/extractor/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6ab2d68d6..3ab72ff76 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,6 +34,7 @@ from ..utils import ( fix_xml_ampersands, float_or_none, int_or_none, + preferredencoding, RegexNotFoundError, sanitize_filename, sanitized_Request, @@ -332,7 +333,12 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, compat_str(err)) + err_str = str(err) + # On python 2 error byte string must be decoded with proper + # encoding rather than ascii + if sys.version_info[0] < 3: + err_str = err_str.decode(preferredencoding()) + errmsg = '%s: %s' % (errnote, err_str) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: From ec6504b39c9b3d9766c551691a96e481278fa659 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 19 Dec 2015 23:28:54 +0100 Subject: [PATCH 64/92] [gputechconf] Add new extractor(closes #5775) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/gputechconf.py | 57 +++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/gputechconf.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fef2cff62..65c5e1c92 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -235,6 +235,7 @@ from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE from .hearthisat import HearThisAtIE diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py new file mode 100644 index 000000000..becc61f1e --- /dev/null +++ b/youtube_dl/extractor/gputechconf.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, + parse_duration, +) + + +class GPUTechConfIE(InfoExtractor): + _VALID_URL = r'https?://on-demand\.gputechconf\.com/gtc/2015/video/S(?P<id>\d+)\.html' + _TEST = { + 'url': 'http://on-demand.gputechconf.com/gtc/2015/video/S5156.html', + 'md5': 'a8862a00a0fd65b8b43acc5b8e33f798', + 'info_dict': { + 'id': '5156', + 'ext': 'mp4', + 'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis', + 'duration': 1219, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/') + xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id') + + doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) + + metadata = xpath_element(doc, 'metadata') + http_host = xpath_text(metadata, 'httpHost') + mbr_videos = xpath_element(metadata, 'MBRVideos') + + formats = [] + for mbr_video in mbr_videos.findall('MBRVideo'): + stream_name = xpath_text(mbr_video, 'streamName') + if stream_name: + bitrate = int_or_none(xpath_text(mbr_video, 'bitrate')) + formats.append({ + 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), + 'tbr': bitrate, + 'format_id': 'http-%d' % bitrate, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': xpath_text(metadata, 'title'), + 'duration': parse_duration(xpath_text(metadata, 'endTime')), + 'creator': xpath_text(metadata, 'speaker'), + 'formats': formats, + } From 1deb710f26b0ccd14db13103cb400b663e243c57 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 19 Dec 2015 23:59:00 +0100 Subject: [PATCH 65/92] [gputechconf] improve extraction --- youtube_dl/extractor/gputechconf.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py index becc61f1e..145b55bf3 100644 --- a/youtube_dl/extractor/gputechconf.py +++ b/youtube_dl/extractor/gputechconf.py @@ -33,18 +33,16 @@ class GPUTechConfIE(InfoExtractor): doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id) metadata = xpath_element(doc, 'metadata') - http_host = xpath_text(metadata, 'httpHost') + http_host = xpath_text(metadata, 'httpHost', 'http host', True) mbr_videos = xpath_element(metadata, 'MBRVideos') formats = [] for mbr_video in mbr_videos.findall('MBRVideo'): stream_name = xpath_text(mbr_video, 'streamName') if stream_name: - bitrate = int_or_none(xpath_text(mbr_video, 'bitrate')) formats.append({ 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')), - 'tbr': bitrate, - 'format_id': 'http-%d' % bitrate, + 'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')), }) self._sort_formats(formats) From fdae2358581be6f80ac5fd20b1062e87a8797f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 05:26:47 +0600 Subject: [PATCH 66/92] [utils] Add error_to_str --- youtube_dl/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5b396ede8..6d3119760 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1806,6 +1806,15 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) +def error_to_str(err): + err_str = str(err) + # On python 2 error byte string must be decoded with proper + # encoding rather than ascii + if sys.version_info[0] < 3: + err_str = err_str.decode(preferredencoding()) + return err_str + + def mimetype2ext(mt): _, _, res = mt.rpartition('/') From 7f8b271465df75bb1f83cb181dc45fee5fe02cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 05:27:38 +0600 Subject: [PATCH 67/92] Properly convert errors to strings --- youtube_dl/YoutubeDL.py | 7 ++++--- youtube_dl/downloader/common.py | 4 ++-- youtube_dl/extractor/common.py | 12 ++++-------- youtube_dl/extractor/dailymotion.py | 6 +++--- youtube_dl/extractor/facebook.py | 4 ++-- youtube_dl/extractor/youtube.py | 3 ++- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c642a1fbf..26b3adb02 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -48,6 +48,7 @@ from .utils import ( determine_ext, DownloadError, encodeFilename, + error_to_str, ExtractorError, format_bytes, formatSeconds, @@ -681,7 +682,7 @@ class YoutubeDL(object): raise except Exception as e: if self.params.get('ignoreerrors', False): - self.report_error(compat_str(e), tb=compat_str(traceback.format_exc())) + self.report_error(error_to_str(e), tb=compat_str(traceback.format_exc())) break else: raise @@ -1459,7 +1460,7 @@ class YoutubeDL(object): if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: - self.report_error('unable to create directory ' + compat_str(err)) + self.report_error('unable to create directory ' + error_to_str(err)) return if self.params.get('writedescription', False): @@ -2039,4 +2040,4 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], compat_str(err))) + (t['url'], error_to_str(err))) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index b8bf8daf8..eb63ccffd 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,9 +5,9 @@ import re import sys import time -from ..compat import compat_str from ..utils import ( encodeFilename, + error_to_str, decodeArgument, format_bytes, timeconvert, @@ -186,7 +186,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % compat_str(err)) + self.report_error('unable to rename file: %s' % error_to_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3ab72ff76..5659e40c8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -30,11 +30,11 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + error_to_str, ExtractorError, fix_xml_ampersands, float_or_none, int_or_none, - preferredencoding, RegexNotFoundError, sanitize_filename, sanitized_Request, @@ -333,12 +333,8 @@ class InfoExtractor(object): return False if errnote is None: errnote = 'Unable to download webpage' - err_str = str(err) - # On python 2 error byte string must be decoded with proper - # encoding rather than ascii - if sys.version_info[0] < 3: - err_str = err_str.decode(preferredencoding()) - errmsg = '%s: %s' % (errnote, err_str) + + errmsg = '%s: %s' % (errnote, error_to_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -628,7 +624,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_str(err)) return (username, password) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 428556213..7d66baf96 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -7,10 +7,10 @@ import itertools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + error_to_str, + ExtractorError, int_or_none, parse_iso8601, sanitized_Request, @@ -278,7 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_str(err)) return {} info = json.loads(sub_list) if (info['total'] > 0): diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 321eec59e..296d3dd77 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -7,11 +7,11 @@ import socket from .common import InfoExtractor from ..compat import ( compat_http_client, - compat_str, compat_urllib_error, compat_urllib_parse_unquote, ) from ..utils import ( + error_to_str, ExtractorError, limit_length, sanitized_Request, @@ -116,7 +116,7 @@ class FacebookIE(InfoExtractor): if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning('unable to log in: %s' % compat_str(err)) + self._downloader.report_warning('unable to log in: %s' % error_to_str(err)) return def _real_initialize(self): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4556a16fb..d63aa5f4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import ( from ..utils import ( clean_html, encode_dict, + error_to_str, ExtractorError, float_or_none, get_element_by_attribute, @@ -903,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_str(err)) return {} sub_lang_list = {} From fb043a6e4ee98a2ea9fbb0975e701341bbd34bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:16:19 +0600 Subject: [PATCH 68/92] [YoutubeDL] Use error_to_str --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 26b3adb02..216b5a5db 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -675,8 +675,8 @@ class YoutubeDL(object): return self.process_ie_result(ie_result, download, extra_info) else: return ie_result - except ExtractorError as de: # An error we somewhat expected - self.report_error(compat_str(de), de.format_traceback()) + except ExtractorError as e: # An error we somewhat expected + self.report_error(error_to_str(e), e.format_traceback()) break except MaxDownloadsReached: raise From 8900ab4d9bc6bf8a05a1a2608efaff08561945e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:22:01 +0600 Subject: [PATCH 69/92] [YoutubeDL] More error_to_str --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 216b5a5db..57df6a279 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1511,7 +1511,7 @@ class YoutubeDL(object): sub_info['url'], info_dict['id'], note=False) except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, compat_str(err.cause))) + (sub_lang, error_to_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) From 8e60dc7526596f456c0b5d7dc48daa6cae08ebb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:26:26 +0600 Subject: [PATCH 70/92] [utils] Add encode_compat_str --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6d3119760..9a2dd1439 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1712,6 +1712,10 @@ def encode_dict(d, encoding='utf-8'): return dict((encode(k), encode(v)) for k, v in d.items()) +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): + return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) + + US_RATINGS = { 'G': 0, 'PG': 10, From c0384f221e5a8383c377a3c43f634cf9807eb634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:29:36 +0600 Subject: [PATCH 71/92] Use proper encoding on compat_str construction when necessary --- youtube_dl/YoutubeDL.py | 5 +++-- youtube_dl/update.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 57df6a279..1795097ae 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -47,6 +47,7 @@ from .utils import ( DEFAULT_OUTTMPL, determine_ext, DownloadError, + encode_compat_str, encodeFilename, error_to_str, ExtractorError, @@ -496,7 +497,7 @@ class YoutubeDL(object): tb = '' if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) - tb += compat_str(traceback.format_exc()) + tb += encode_compat_str(traceback.format_exc()) else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) @@ -682,7 +683,7 @@ class YoutubeDL(object): raise except Exception as e: if self.params.get('ignoreerrors', False): - self.report_error(error_to_str(e), tb=compat_str(traceback.format_exc())) + self.report_error(error_to_str(e), tb=encode_compat_str(traceback.format_exc())) break else: raise diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 074eb64a7..995b8ed96 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -9,7 +9,7 @@ import subprocess import sys from zipimport import zipimporter -from .compat import compat_str +from .utils import encode_compat_str from .version import __version__ @@ -61,7 +61,7 @@ def update_self(to_screen, verbose, opener): newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() except Exception: if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') return if newversion == __version__: @@ -74,7 +74,7 @@ def update_self(to_screen, verbose, opener): versions_info = json.loads(versions_info) except Exception: if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. Please try again later.') return if 'signature' not in versions_info: @@ -123,7 +123,7 @@ def update_self(to_screen, verbose, opener): urlh.close() except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') return @@ -137,7 +137,7 @@ def update_self(to_screen, verbose, opener): outf.write(newcontent) except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to write the new version') return @@ -157,7 +157,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" return # Do not show premature success messages except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to overwrite current version') return @@ -169,7 +169,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" urlh.close() except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') return @@ -183,7 +183,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" outf.write(newcontent) except (IOError, OSError): if verbose: - to_screen(compat_str(traceback.format_exc())) + to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to overwrite current version') return From 2c74e6fa77935804b8e63807e23762e97a25f4d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:35:58 +0600 Subject: [PATCH 72/92] [YoutubeDL] Revert error_to_str for ExtractorError --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1795097ae..bf05e9340 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -677,7 +677,7 @@ class YoutubeDL(object): else: return ie_result except ExtractorError as e: # An error we somewhat expected - self.report_error(error_to_str(e), e.format_traceback()) + self.report_error(compat_str(e), e.format_traceback()) break except MaxDownloadsReached: raise From d890b4cc0ac6b88daa917a1a791afe7541cd0411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 06:43:42 +0600 Subject: [PATCH 73/92] [nbc:news] Remove unnecessary compat_str --- youtube_dl/extractor/nbc.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 4c1eca96f..340c922bd 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, find_xpath_attr, @@ -189,7 +186,7 @@ class NBCNewsIE(InfoExtractor): 'title': info.find('headline').text, 'ext': 'flv', 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': compat_str(info.find('caption').text), + 'description': info.find('caption').text, 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, } else: From 9b9c5355e4c0fe6f9028aaa3b1b8beb28a085433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 07:00:39 +0600 Subject: [PATCH 74/92] Rename error_to_str to error_to_compat_str --- youtube_dl/YoutubeDL.py | 10 +++++----- youtube_dl/downloader/common.py | 4 ++-- youtube_dl/extractor/common.py | 6 +++--- youtube_dl/extractor/dailymotion.py | 4 ++-- youtube_dl/extractor/facebook.py | 4 ++-- youtube_dl/extractor/youtube.py | 4 ++-- youtube_dl/utils.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index bf05e9340..50425b8d7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,7 +49,7 @@ from .utils import ( DownloadError, encode_compat_str, encodeFilename, - error_to_str, + error_to_compat_str, ExtractorError, format_bytes, formatSeconds, @@ -683,7 +683,7 @@ class YoutubeDL(object): raise except Exception as e: if self.params.get('ignoreerrors', False): - self.report_error(error_to_str(e), tb=encode_compat_str(traceback.format_exc())) + self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) break else: raise @@ -1461,7 +1461,7 @@ class YoutubeDL(object): if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_str(err)) + self.report_error('unable to create directory ' + error_to_compat_str(err)) return if self.params.get('writedescription', False): @@ -1512,7 +1512,7 @@ class YoutubeDL(object): sub_info['url'], info_dict['id'], note=False) except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_str(err.cause))) + (sub_lang, error_to_compat_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) @@ -2041,4 +2041,4 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_str(err))) + (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index eb63ccffd..beae8c4d0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -7,7 +7,7 @@ import time from ..utils import ( encodeFilename, - error_to_str, + error_to_compat_str, decodeArgument, format_bytes, timeconvert, @@ -186,7 +186,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % error_to_str(err)) + self.report_error('unable to rename file: %s' % error_to_compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5659e40c8..828f58f12 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -30,7 +30,7 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, - error_to_str, + error_to_compat_str, ExtractorError, fix_xml_ampersands, float_or_none, @@ -334,7 +334,7 @@ class InfoExtractor(object): if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, error_to_str(err)) + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -624,7 +624,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_str(err)) + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) return (username, password) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 7d66baf96..0c5b6617f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -9,7 +9,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, - error_to_str, + error_to_compat_str, ExtractorError, int_or_none, parse_iso8601, @@ -278,7 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} info = json.loads(sub_list) if (info['total'] > 0): diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 296d3dd77..39c481068 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -11,7 +11,7 @@ from ..compat import ( compat_urllib_parse_unquote, ) from ..utils import ( - error_to_str, + error_to_compat_str, ExtractorError, limit_length, sanitized_Request, @@ -116,7 +116,7 @@ class FacebookIE(InfoExtractor): if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning('unable to log in: %s' % error_to_str(err)) + self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err)) return def _real_initialize(self): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d63aa5f4e..89759a1cb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..compat import ( from ..utils import ( clean_html, encode_dict, - error_to_str, + error_to_compat_str, ExtractorError, float_or_none, get_element_by_attribute, @@ -904,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) return {} sub_lang_list = {} diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9a2dd1439..1737ac5f6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1810,7 +1810,7 @@ def args_to_str(args): return ' '.join(shlex_quote(a) for a in args) -def error_to_str(err): +def error_to_compat_str(err): err_str = str(err) # On python 2 error byte string must be decoded with proper # encoding rather than ascii From 6b77d52b1f8ca3777284e4164e5363c110aeba66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 07:07:14 +0600 Subject: [PATCH 75/92] [test_utils] Add tests for encode_compat_str --- test/test_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index e4e8d3825..1c3290d9b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,6 +22,7 @@ from youtube_dl.utils import ( DateRange, detect_exe_version, determine_ext, + encode_compat_str, encodeFilename, escape_rfc3986, escape_url, @@ -449,6 +450,10 @@ class TestUtil(unittest.TestCase): data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'}) self.assertTrue(isinstance(data, bytes)) + def test_encode_compat_str(self): + self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест') + self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест') + def test_parse_iso8601(self): self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) From e462474e1d944e8b1547edc619a13a7ee2abbc2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Dec 2015 07:48:16 +0600 Subject: [PATCH 76/92] [youtube] Generalize playlists extractor --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/youtube.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 65c5e1c92..eac50eda5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -852,7 +852,7 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubeUserPlaylistsIE, + YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 89759a1cb..4aac2cc03 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1775,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): }, }] + @classmethod + def suitable(cls, url): + return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) @@ -1848,10 +1852,10 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) -class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:user:playlists' +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): + IE_DESC = 'YouTube.com user/channel playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + IE_NAME = 'youtube:playlists' _TESTS = [{ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists', @@ -1868,6 +1872,13 @@ class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'id': 'igorkle1', 'title': 'Игорь Клейнер', }, + }, { + 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', + 'playlist_mincount': 17, + 'info_dict': { + 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', + 'title': 'Chem Player', + }, }] From c6ed6fadc248bd4957fd7cccc1eeb0f109ac3ffa Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 20 Dec 2015 12:43:00 +0100 Subject: [PATCH 77/92] [cnet] improve extraction - relex data json regex - extract the platform metadata once - extract hds formats - extract duration - extract thumbnail --- youtube_dl/extractor/cnet.py | 41 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3ecf0efd4..5c3908f72 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -1,33 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals -import json - -from .common import InfoExtractor from .theplatform import ThePlatformIE +from ..utils import int_or_none -class CNETIE(InfoExtractor): +class CNETIE(ThePlatformIE): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', + 'duration': 70, }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'mp4', + 'ext': 'flv', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + 'duration': 1482, }, }] @@ -36,10 +36,10 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"videoPlayer\"\s+.*?data-cnet-video-uvp-options='([^']+)'", + r"data-cnet-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') - data = json.loads(data_json) - vdata = data['videos'][0] + data = self._parse_json(data_json, display_id) + vdata = data.get('video') or data['videos'][0] video_id = vdata['id'] title = vdata['title'] @@ -52,19 +52,22 @@ class CNETIE(InfoExtractor): uploader_id = None mpx_account = data['config']['uvpConfig']['default']['mpx_account'] - tp = ThePlatformIE(self._downloader) + + metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) + description = vdata.get('description') or metadata.get('description') + duration = int_or_none(vdata.get('duration')) or metadata.get('duration') + formats = [] subtitles = {} - description = vdata.get('description') - for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) - formats.extend(result['formats']) - subtitles = self._merge_subtitles(subtitles, result['subtitles']) - description = description or result.get('description') - + release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) + if fkey == 'hds': + release_url += '&manifest=f4m' + tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) return { @@ -72,6 +75,8 @@ class CNETIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'thumbnail': metadata.get('thumbnail'), + 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, From 6882c0870eb89a17eb81d3d273a9f9e42a8bea5e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 20 Dec 2015 15:48:19 +0100 Subject: [PATCH 78/92] [tele13] improve extraction - improve jwplayer setup regex - sort formats - remove duplicate formats - update youtube test --- youtube_dl/extractor/tele13.py | 64 +++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index f1764eb2f..a363b4d40 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -1,10 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import js_to_json +from .youtube import YoutubeIE +from ..utils import ( + js_to_json, + qualities, + determine_ext, +) class Tele13IE(InfoExtractor): @@ -25,12 +28,12 @@ class Tele13IE(InfoExtractor): }, { 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', - 'md5': '65d1ae54812c96f4b345dd21d3bb1adc', + 'md5': '867adf6a3b3fef932c68a71d70b70946', 'info_dict': { 'id': 'rOoKv2OMpOw', 'ext': 'mp4', 'title': 'Shooting star seen on 7-Sep-2015', - 'description': 'md5:a1cd2e74f6ee6851552c9cf5851d6b06', + 'description': 'md5:7292ff2a34b2f673da77da222ae77e1e', 'uploader': 'Porjai Jaturongkhakun', 'upload_date': '20150906', 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', @@ -41,41 +44,38 @@ class Tele13IE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - setup_js = self._parse_json( - js_to_json( - self._search_regex( - r"jwplayer\('player-vivo'\).setup\((\{.*?\})\)", - webpage, - 'setup code', - flags=re.DOTALL - ).replace('\n//', '') - ), - display_id - ) - title = setup_js['title'] - thumbnail = setup_js.get('image') or setup_js['playlist'][0].get('image') - description = self._html_search_meta( - 'description', webpage, 'description') + setup_js = self._search_regex(r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)", webpage, 'setup code') + sources = self._parse_json(self._search_regex(r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'), display_id, js_to_json) + preference = qualities(['Móvil', 'SD', 'HD']) formats = [] - for f in setup_js['playlist'][0]['sources']: + urls = [] + for f in sources: format_url = f['file'] - if format_url != '': - if '.m3u8' in format_url: - formats.extend(self._extract_m3u8_formats(format_url, display_id)) + if format_url and format_url not in urls: + ext = determine_ext(format_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats(format_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif YoutubeIE.suitable(format_url): + return self.url_result(format_url, 'Youtube') else: - if 'youtube.com' in format_url: - return self.url_result(format_url, 'Youtube') - else: - formats.append({'url': format_url, 'format_id': f.get('label')}) + formats.append({ + 'url': format_url, + 'format_id': f.get('label'), + 'preference': preference(f.get('label')), + 'ext': ext, + }) + urls.append(format_url) + self._sort_formats(formats) return { 'id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._search_regex(r'title\s*:\s*"([^"]+)"', setup_js, 'title'), + 'description': self._html_search_meta('description', webpage, 'description'), + 'thumbnail': self._search_regex(r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None), 'formats': formats, } From ecbccea703cc7812b66c6dd3a543d60b5be8aa48 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 20 Dec 2015 21:38:30 +0100 Subject: [PATCH 79/92] [faz] extract duration and bitrate and use xpath_element and xpath_text for extraction --- youtube_dl/extractor/faz.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index d9a868119..6f9b003c2 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -2,6 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + xpath_element, + xpath_text, + int_or_none, +) class FazIE(InfoExtractor): @@ -37,31 +42,32 @@ class FazIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + description = self._og_search_description(webpage) config_xml_url = self._search_regex( - r'(?:var\s+)?videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') + r'videoXMLURL\s*=\s*"([^"]+)', webpage, 'config xml url') config = self._download_xml( config_xml_url, video_id, 'Downloading config xml') - encodings = config.find('ENCODINGS') + encodings = xpath_element(config, 'ENCODINGS', 'encodings', True) formats = [] for pref, code in enumerate(['LOW', 'HIGH', 'HQ']): - encoding = encodings.find(code) - if encoding is None: - continue - encoding_url = encoding.find('FILENAME').text - formats.append({ - 'url': encoding_url, - 'format_id': code.lower(), - 'quality': pref, - }) + encoding = xpath_element(encodings, code) + if encoding: + encoding_url = xpath_text(encoding, 'FILENAME') + if encoding_url: + formats.append({ + 'url': encoding_url, + 'format_id': code.lower(), + 'quality': pref, + 'tbr': int_or_none(xpath_text(encoding, 'AVERAGEBITRATE')), + }) self._sort_formats(formats) - descr = self._html_search_regex( - r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False) return { 'id': video_id, 'title': self._og_search_title(webpage), 'formats': formats, - 'description': descr, - 'thumbnail': config.find('STILL/STILL_BIG').text, + 'description': description.strip() if description else None, + 'thumbnail': xpath_text(config, 'STILL/STILL_BIG'), + 'duration': int_or_none(xpath_text(config, 'DURATION')), } From 5b95419ca59e6238ab0041d48a53a9e51485e1cc Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 00:20:22 +0100 Subject: [PATCH 80/92] [flickr] extract views_count and tags --- youtube_dl/extractor/flickr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 92d2ac553..452b27b26 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -25,6 +25,8 @@ class FlickrIE(InfoExtractor): 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', 'comment_count': int, + 'view_count': int, + 'tags': list, } } @@ -78,6 +80,8 @@ class FlickrIE(InfoExtractor): 'uploader_id': owner.get('nsid'), 'uploader': owner.get('realname'), 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + 'view_count': int_or_none(video_info.get('views')), + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] } else: raise ExtractorError('not a video', expected=True) From 5b251628e9f45c89c1becb3f62c4212874eb74ea Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 03:05:34 +0100 Subject: [PATCH 81/92] [googledrive] Modernize --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/generic.py | 4 +- youtube_dl/extractor/googledrive.py | 146 ++++++++++------------------ 3 files changed, 54 insertions(+), 101 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 02e18a0da..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,10 +209,7 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import ( - GoogleDriveEmbedIE, - GoogleDriveIE, -) +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3f7b094db..abd98e500 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,7 +48,7 @@ from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE -from .googledrive import GoogleDriveEmbedIE +from .googledrive import GoogleDriveIE class GenericIE(InfoExtractor): @@ -1601,7 +1601,7 @@ class GenericIE(InfoExtractor): return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') # Look for Google Drive embeds - google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + google_drive_url = GoogleDriveIE._extract_url(webpage) if google_drive_url: return self.url_result(google_drive_url, 'GoogleDrive') diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 7bc7b7a0d..f354c9c7a 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,132 +1,88 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( - RegexNotFoundError, ExtractorError, + int_or_none, ) -class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { - 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { - 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + 'title': 'Big Buck Bunny.mp4', + 'duration': 46, } } + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', + } @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url', - 'ie_key': 'GoogleDrive', - 'url': 'https://drive.google.com/file/d/%s' % video_id - } - -class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' - _TEST = { - 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'info_dict': { - 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', - 'ext': 'mp4', - 'title': 'Big Buck Bunny.mp4', - } - } - _formats = { - '5': {'ext': 'flv'}, - '6': {'ext': 'flv'}, - '13': {'ext': '3gp'}, - '17': {'ext': '3gp'}, - '18': {'ext': 'mp4'}, - '22': {'ext': 'mp4'}, - '34': {'ext': 'flv'}, - '35': {'ext': 'flv'}, - '36': {'ext': '3gp'}, - '37': {'ext': 'mp4'}, - '38': {'ext': 'mp4'}, - '43': {'ext': 'webm'}, - '44': {'ext': 'webm'}, - '45': {'ext': 'webm'}, - '46': {'ext': 'webm'}, - '59': {'ext': 'mp4'} - } - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' - ) - try: - title = self._html_search_regex( - r'"title"\s*,\s*"([^"]+)', - webpage, - 'title' - ) - fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', - webpage, - 'fmt_stream_map' - ) - fmt_list = self._html_search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', - webpage, - 'fmt_list' - ) -# timestamp = self._html_search_regex( -# r'"timestamp"\s*,\s*"([^"]+)', -# webpage, -# 'timestamp' -# ) - length_seconds = self._html_search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', - webpage, - 'length_seconds' - ) - except RegexNotFoundError: - try: - reason = self._html_search_regex( - r'"reason","([^"]+)', - webpage, - 'reason' - ) - raise ExtractorError(reason) - return - except RegexNotFoundError: - raise ExtractorError('not a video') - return + 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + + reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason) + + title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + duration = int_or_none(self._search_regex( + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') + fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - fmt_stream_map = fmt_stream_map.split(',') - fmt_list = fmt_list.split(',') formats = [] - for i in range(len(fmt_stream_map)): - fmt_id, fmt_url = fmt_stream_map[i].split('|') - resolution = fmt_list[i].split('/')[1] + for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): + fmt_id, fmt_url = fmt_stream.split('|') + resolution = fmt.split('/')[1] width, height = resolution.split('x') formats.append({ 'url': fmt_url, 'format_id': fmt_id, 'resolution': resolution, - 'width': int(width), - 'height': int(height), - 'ext': self._formats[fmt_id]['ext'] + 'width': int_or_none(width), + 'height': int_or_none(height), + 'ext': self._FORMATS_EXT[fmt_id], }) self._sort_formats(formats) return { 'id': video_id, 'title': title, -# 'timestamp': int(timestamp), - 'duration': int(length_seconds), - 'formats': formats + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'formats': formats, } From 7cb09524749d4a061acc801d0a2d6ad08463e549 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 04:24:58 +0100 Subject: [PATCH 82/92] [makertv] improve extraction --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/jwplatform.py | 10 +++++++--- youtube_dl/extractor/makertv.py | 13 +++++++++---- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4d38b0c9d..f5dd88f54 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -53,6 +53,7 @@ from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE +from .jwplatform import JWPlatformIE class GenericIE(InfoExtractor): @@ -1787,6 +1788,11 @@ class GenericIE(InfoExtractor): if snagfilms_url: return self.url_result(snagfilms_url) + # Look for JWPlatform embeds + jwplatform_url = JWPlatformIE._extract_url(webpage) + if jwplatform_url: + return self.url_result(jwplatform_url, 'JWPlatform') + # Look for ScreenwaveMedia embeds mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) if mobj is not None: diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 3a3dc439a..cdc095a79 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import int_or_none @@ -23,7 +25,7 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8}', + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', webpage) if mobj: return mobj.group('url') @@ -42,7 +44,9 @@ class JWPlatformIE(InfoExtractor): source_url = self._proto_relative_url(source['file']) source_type = source.get('type') or '' if source_type == 'application/vnd.apple.mpegurl': - formats.extend(self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None)) + m3u8_formats = self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) elif source_type.startswith('audio'): formats.append({ 'url': source_url, @@ -57,7 +61,7 @@ class JWPlatformIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_data['mediaid'], + 'id': video_id, 'title': video_data['title'], 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py index 0256e4e24..3c34d4604 100644 --- a/youtube_dl/extractor/makertv.py +++ b/youtube_dl/extractor/makertv.py @@ -5,12 +5,12 @@ from .common import InfoExtractor class MakerTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)?video|http://makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' _TEST = { 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', 'info_dict': { - 'id': 'brOEcGut', + 'id': 'Fh3QgymL9gsc', 'ext': 'mp4', 'title': 'Maze Runner: The Scorch Trials Official Movie Review', 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', @@ -22,6 +22,11 @@ class MakerTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - jwplatform_id = self._search_regex([r'jwid="([^"]+)"', r'Maker.jw_id\s*=\s*"([^"]+)";'], webpage, 'jwplatform id') + jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id') - return self.url_result('jwplatform:%s' % jwplatform_id, 'JWPlatform') + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': 'JWPlatform', + } From c7fa5fa42cb91aa32c5b21c6821f6d190ec047a2 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 11:12:58 +0100 Subject: [PATCH 83/92] [bleacherreport] fix style issues and simplify --- youtube_dl/extractor/amp.py | 38 +++++++++++++------------- youtube_dl/extractor/bleacherreport.py | 19 ++----------- youtube_dl/extractor/dramafever.py | 6 ++-- youtube_dl/extractor/foxnews.py | 19 +++++-------- 4 files changed, 30 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index b573b9280..dcc3c97f1 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -9,23 +9,21 @@ from ..utils import ( class AMPIE(InfoExtractor): - def _get_media_node(self, item, name, default=None): - media_name = 'media-%s' % name - media_group = item.get('media-group') or item - return media_group.get(media_name) or item.get(media_name) or item.get(name, default) - # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): item = self._download_json( - url, None, - 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed' - )['channel']['item'] + url, None, 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed')['channel']['item'] video_id = item['guid'] - + + def get_media_node(name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + thumbnails = [] - media_thumbnail = self._get_media_node(item, 'thumbnail') + media_thumbnail = get_media_node('thumbnail') if media_thumbnail: if isinstance(media_thumbnail, dict): media_thumbnail = [media_thumbnail] @@ -38,7 +36,7 @@ class AMPIE(InfoExtractor): }) subtitles = {} - media_subtitle = self._get_media_node(item, 'subTitle') + media_subtitle = get_media_node('subTitle') if media_subtitle: if isinstance(media_subtitle, dict): media_subtitle = [media_subtitle] @@ -48,26 +46,28 @@ class AMPIE(InfoExtractor): subtitles[lang] = [{'url': subtitle['href']}] formats = [] - media_content = self._get_media_node(item, 'content') + media_content = get_media_node('content') if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: media = media_data['@attributes'] media_type = media['type'] if media_type == 'video/f4m': - f4m_formats = self._extract_f4m_formats(media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False) + f4m_formats = self._extract_f4m_formats( + media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + video_id, f4m_id='hds', fatal=False) if f4m_formats: formats.extend(f4m_formats) elif media_type == 'application/x-mpegURL': - m3u8_formats = self._extract_m3u8_formats(media['url'], video_id, m3u8_id='hls', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) else: formats.append({ 'format_id': media_data['media-category']['@attributes']['label'], 'url': media['url'], - 'preference': 1, - 'vbr': int_or_none(media.get('bitrate')), + 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), }) @@ -75,8 +75,8 @@ class AMPIE(InfoExtractor): return { 'id': video_id, - 'title': self._get_media_node(item, 'title'), - 'description': self._get_media_node(item, 'description'), + 'title': get_media_node('title'), + 'description': get_media_node('description'), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(item.get('pubDate'), ' '), 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index a55e696d2..bd2a6340b 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -26,7 +26,7 @@ class BleacherReportIE(InfoExtractor): 'uploader': 'Team Stream Now ', }, 'add_ie': ['Ooyala'], - },{ + }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', 'info_dict': { @@ -35,25 +35,11 @@ class BleacherReportIE(InfoExtractor): 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', - 'description': 'md5:e95afafa43619816552723878b3b0a84', + 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', 'uploader_id': 6466954, 'upload_date': '20151011', }, 'add_ie': ['Youtube'], - },{ - 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', - 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', - 'info_dict': { - 'id': '2496438', - 'ext': 'mp4', - 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', - 'upload_date': '20150615', - 'uploader': 'Team Stream Now ', - 'timestamp': 1434380212, - 'description': 'CFB, ACC, Florida State', - 'uploader_id': 3992341, - }, - 'add_ie': ['Vine'], }] def _real_extract(self, url): @@ -115,7 +101,6 @@ class BleacherReportCMSIE(AMPIE): def _real_extract(self, url): video_id = self._match_id(url) - info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) info['id'] = video_id return info diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 80a928827..2a4310754 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -13,9 +13,6 @@ from ..compat import ( from ..utils import ( ExtractorError, clean_html, - determine_ext, - int_or_none, - parse_iso8601, ) @@ -91,7 +88,8 @@ class DramaFeverIE(DramaFeverBaseIE): video_id = self._match_id(url).replace('/', '.') try: - info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) + info = self._extract_feed_info( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 0cd0f9fa8..318ac013d 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals import re from .amp import AMPIE -from ..utils import ( - parse_iso8601, - int_or_none, -) class FoxNewsIE(AMPIE): @@ -22,8 +18,8 @@ class FoxNewsIE(AMPIE): 'title': 'Frozen in Time', 'description': '16-year-old girl is size of toddler', 'duration': 265, - #'timestamp': 1304411491, - #'upload_date': '20110503', + # 'timestamp': 1304411491, + # 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -36,8 +32,8 @@ class FoxNewsIE(AMPIE): 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", 'description': "Congressman discusses president's plan", 'duration': 292, - #'timestamp': 1417662047, - #'upload_date': '20141204', + # 'timestamp': 1417662047, + # 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -52,10 +48,9 @@ class FoxNewsIE(AMPIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') + host, video_id = re.match(self._VALID_URL, url).groups() - info = self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info = self._extract_feed_info( + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) info['id'] = video_id return info From 0197004f786eba88238c5dd638dc1923b85b2b4c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Dec 2015 11:42:25 +0100 Subject: [PATCH 84/92] release 2015.12.21 --- docs/supportedsites.md | 13 +++++++++---- youtube_dl/version.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8253335e3..299bc5e72 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -65,9 +65,9 @@ - **Bet** - **Bild**: Bild.de - **BiliBili** + - **BleacherReport** + - **BleacherReportCMS** - **blinkx** - - **blip.tv:user** - - **BlipTV** - **Bloomberg** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek @@ -80,7 +80,6 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** - - **Canal13cl** - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **CBS** @@ -210,7 +209,9 @@ - **GodTube** - **GoldenMoustache** - **Golem** + - **GoogleDrive** - **Goshgay** + - **GPUTechConf** - **Groupon** - **Hark** - **HearThisAt** @@ -252,6 +253,7 @@ - **Jove** - **jpopsuki.tv** - **Jukebox** + - **JWPlatform** - **Kaltura** - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** @@ -292,6 +294,7 @@ - **m6** - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru + - **MakerTV** - **Malemotion** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -551,6 +554,7 @@ - **TechTalks** - **techtv.mit.edu** - **ted** + - **Tele13** - **TeleBruxelles** - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** @@ -573,6 +577,7 @@ - **TMZ** - **TMZArticle** - **TNAFlix** + - **toggle** - **tou.tv** - **Toypics**: Toypics user profile - **ToypicsUser**: Toypics user profile @@ -711,6 +716,7 @@ - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:playlist**: YouTube.com playlists + - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first @@ -718,7 +724,6 @@ - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - - **youtube:user:playlists**: YouTube.com user playlists - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** - **ZDF** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 01607693e..7095033c5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.18' +__version__ = '2015.12.21' From 5c5a3ecf1b5337510378b782a14525584769e6d6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 13:07:52 +0100 Subject: [PATCH 85/92] [abc] detect expired state and update tests --- youtube_dl/extractor/abc.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c0e5d1abf..6a29e587f 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -23,6 +23,7 @@ class ABCIE(InfoExtractor): 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', }, + 'skip': 'this video has expired', }, { 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', 'md5': 'db2a5369238b51f9811ad815b69dc086', @@ -36,6 +37,7 @@ class ABCIE(InfoExtractor): 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', }, 'add_ie': ['Youtube'], + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080', 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f', @@ -58,6 +60,9 @@ class ABCIE(InfoExtractor): r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);', webpage) if mobj is None: + expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None) + if expired: + raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True) raise ExtractorError('Unable to extract video urls') urls_info = self._parse_json( From 61ebb401b7ff8f15850bfc5778cdc08ab72145e1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 16:26:40 +0100 Subject: [PATCH 86/92] [atresplayer] improve extraction - select hashlib.md5 constructor as digestmod(in python 3.4+ MD5 as implicit default digest for digestmod is deprecated.) - extract hls formats - update tests - extract errors --- youtube_dl/extractor/atresplayer.py | 117 +++++++++++++++++++--------- 1 file changed, 81 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 50e47ba0a..7ac3044c7 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import time import hmac +import hashlib +import re from .common import InfoExtractor from ..compat import ( @@ -32,6 +34,19 @@ class AtresPlayerIE(InfoExtractor): 'duration': 5527.6, 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'This video is only available for registered users' + }, + { + 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', + 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'info_dict': { + 'id': 'capitulo-112-david-bustamante', + 'ext': 'flv', + 'title': 'David Bustamante', + 'description': 'md5:f33f1c0a05be57f6708d4dd83a3b81c6', + 'duration': 1439.0, + 'thumbnail': 're:^https?://.*\.jpg$', + }, }, { 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', @@ -50,6 +65,13 @@ class AtresPlayerIE(InfoExtractor): _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check' + _ERRORS = { + 'UNPUBLISHED': 'We\'re sorry, but this video is not yet available.', + 'DELETED': 'This video has expired and is no longer available for online streaming.', + 'GEOUNPUBLISHED': 'We\'re sorry, but this video is not available in your region due to right restrictions.', + # 'PREMIUM': 'PREMIUM', + } + def _real_initialize(self): self._login() @@ -83,58 +105,81 @@ class AtresPlayerIE(InfoExtractor): episode_id = self._search_regex( r'episode="([^"]+)"', webpage, 'episode id') + request = sanitized_Request( + self._PLAYER_URL_TEMPLATE % episode_id, + headers={'User-Agent': self._USER_AGENT}) + player = self._download_json(request, episode_id, 'Downloading player JSON') + + episode_type = player.get('typeOfEpisode') + error_message = self._ERRORS.get(episode_type) + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + + formats = [] + video_url = player.get('urlVideo') + if video_url: + format_info = { + 'url': video_url, + 'format_id': 'http', + } + mobj = re.search(r'(?P<bitrate>\d+)K_(?P<width>\d+)x(?P<height>\d+)', video_url) + if mobj: + format_info.update({ + 'width': int_or_none(mobj.group('width')), + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + formats.append(format_info) + + m3u8_url = player.get('urlVideoHls') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, episode_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + timestamp = int_or_none(self._download_webpage( self._TIME_API_URL, video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) token = hmac.new( self._MAGIC.encode('ascii'), - (episode_id + timestamp_shifted).encode('utf-8') + (episode_id + timestamp_shifted).encode('utf-8'), hashlib.md5 ).hexdigest() - formats = [] - for fmt in ['windows', 'android_tablet']: - request = sanitized_Request( - self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) - request.add_header('User-Agent', self._USER_AGENT) + request = sanitized_Request( + self._URL_VIDEO_TEMPLATE.format('windows', episode_id, timestamp_shifted, token), + headers={'User-Agent': self._USER_AGENT}) - fmt_json = self._download_json( - request, video_id, 'Downloading %s video JSON' % fmt) + fmt_json = self._download_json( + request, video_id, 'Downloading windows video JSON') - result = fmt_json.get('resultDes') - if result.lower() != 'ok': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, result), expected=True) + result = fmt_json.get('resultDes') + if result.lower() != 'ok': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, result), expected=True) - for format_id, video_url in fmt_json['resultObject'].items(): - if format_id == 'token' or not video_url.startswith('http'): - continue - if video_url.endswith('/Manifest'): - if 'geodeswowsmpra3player' in video_url: - f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) - # this videos are protected by DRM, the f4m downloader doesn't support them - continue - else: - f4m_url = video_url[:-9] + '/manifest.f4m' - formats.extend(self._extract_f4m_formats(f4m_url, video_id)) - else: - formats.append({ - 'url': video_url, - 'format_id': 'android-%s' % format_id, - 'preference': 1, - }) + for format_id, video_url in fmt_json['resultObject'].items(): + if format_id == 'token' or not video_url.startswith('http'): + continue + if 'geodeswowsmpra3player' in video_url: + f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] + f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) + # this videos are protected by DRM, the f4m downloader doesn't support them + continue + else: + f4m_url = video_url[:-9] + '/manifest.f4m' + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) - player = self._download_json( - self._PLAYER_URL_TEMPLATE % episode_id, - episode_id) - path_data = player.get('pathData') episode = self._download_xml( - self._EPISODE_URL_TEMPLATE % path_data, - video_id, 'Downloading episode XML') + self._EPISODE_URL_TEMPLATE % path_data, video_id, + 'Downloading episode XML') duration = float_or_none(xpath_text( episode, './media/asset/info/technical/contentDuration', 'duration')) From f11d00fa4109ce754c58c4a7b8bef04fc14a257a Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 16:52:47 +0100 Subject: [PATCH 87/92] [test_subtitles] remove BlipTV test --- test/test_subtitles.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 75f0ea75f..9ed9fe622 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -11,7 +11,6 @@ from test.helper import FakeYDL, md5 from youtube_dl.extractor import ( - BlipTVIE, YoutubeIE, DailymotionIE, TEDIE, @@ -145,18 +144,6 @@ class TestTedSubtitles(BaseTestSubtitles): self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) -class TestBlipTVSubtitles(BaseTestSubtitles): - url = 'http://blip.tv/a/a-6603250' - IE = BlipTVIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), '5b75c300af65fe4476dff79478bb93e4') - - class TestVimeoSubtitles(BaseTestSubtitles): url = 'http://vimeo.com/76979871' IE = VimeoIE From 0f15ad7b9b6cae4e8719d2b960d8e882963f7a5b Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 17:07:19 +0100 Subject: [PATCH 88/92] [adultswim] update test --- youtube_dl/extractor/adultswim.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3ae618e71..bf21a6887 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -68,7 +68,7 @@ class AdultSwimIE(InfoExtractor): 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, @@ -79,6 +79,10 @@ class AdultSwimIE(InfoExtractor): 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] @staticmethod From 5ef5d25b150a383362aab046394849ed523b3730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Dec 2015 22:51:58 +0600 Subject: [PATCH 89/92] [audiomack] Fix typo (Closes #7936) --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 693ba22c6..3eed91279 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -56,7 +56,7 @@ class AudiomackIE(InfoExtractor): # API is inconsistent with errors if 'url' not in api_response or not api_response['url'] or 'error' in api_response: - raise ExtractorError('Invalid url %s', url) + raise ExtractorError('Invalid url %s' % url) # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor From 5625bd0617d6c661c66049b420556ae442102ffc Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 21:06:10 +0100 Subject: [PATCH 90/92] [br] add support for br-klassik.de and improve extraction - extend _VALID_URL to match both br.de and br-klassik.de - extract all formats(hls,hds and rtmp) - use xpath_element and xpath_text for xml info extraction --- youtube_dl/extractor/br.py | 127 +++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 66e394e10..e66854538 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -1,18 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, parse_duration, + xpath_element, + xpath_text, ) class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' - _BASE_URL = 'http://www.br.de' + _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _TESTS = [ { @@ -22,7 +25,7 @@ class BRIE(InfoExtractor): 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', 'title': 'Die böse Überraschung', - 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9', 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', @@ -30,23 +33,23 @@ class BRIE(InfoExtractor): }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', - 'md5': 'a44396d73ab6a68a69a568fae10705bb', + 'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef', 'info_dict': { 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Manfred Schreiber ist tot', - 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, } }, { - 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', 'info_dict': { 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', 'ext': 'aac', 'title': 'Kurzweilig und sehr bewegend', - 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, } }, @@ -57,7 +60,7 @@ class BRIE(InfoExtractor): 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', 'ext': 'mp4', 'title': 'Umweltbewusster Häuslebauer', - 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', + 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', 'duration': 116, } }, @@ -68,7 +71,7 @@ class BRIE(InfoExtractor): 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', 'ext': 'mp4', 'title': 'Folge 1 - Metaphysik', - 'description': 'Kant für Anfänger: Folge 1 - Metaphysik', + 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', 'upload_date': '20140117', @@ -77,28 +80,31 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - display_id = self._match_id(url) + base_url, display_id = re.search(self._VALID_URL, url).groups() page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') - xml = self._download_xml(self._BASE_URL + xml_url, None) + xml = self._download_xml(base_url + xml_url, display_id) medias = [] for xml_media in xml.findall('video') + xml.findall('audio'): + media_id = xml_media.get('externalId') media = { - 'id': xml_media.get('externalId'), - 'title': xml_media.find('title').text, - 'duration': parse_duration(xml_media.find('duration').text), - 'formats': self._extract_formats(xml_media.find('assets')), - 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), - 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), - 'webpage_url': xml_media.find('permalink').text + 'id': media_id, + 'title': xpath_text(xml_media, 'title', 'title', True), + 'duration': parse_duration(xpath_text(xml_media, 'duration')), + 'formats': self._extract_formats(xpath_element( + xml_media, 'assets'), media_id), + 'thumbnails': self._extract_thumbnails(xpath_element( + xml_media, 'teaserImage/variants'), base_url), + 'description': xpath_text(xml_media, 'desc'), + 'webpage_url': xpath_text(xml_media, 'permalink'), + 'uploader': xpath_text(xml_media, 'author'), } - if xml_media.find('author').text: - media['uploader'] = xml_media.find('author').text - if xml_media.find('broadcastDate').text: - media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) + broadcast_date = xpath_text(xml_media, 'broadcastDate') + if broadcast_date: + media['upload_date'] = ''.join(reversed(broadcast_date.split('.'))) medias.append(media) if len(medias) > 1: @@ -109,35 +115,58 @@ class BRIE(InfoExtractor): raise ExtractorError('No media entries found') return medias[0] - def _extract_formats(self, assets): - - def text_or_none(asset, tag): - elem = asset.find(tag) - return None if elem is None else elem.text - - formats = [{ - 'url': text_or_none(asset, 'downloadUrl'), - 'ext': text_or_none(asset, 'mediaType'), - 'format_id': asset.get('type'), - 'width': int_or_none(text_or_none(asset, 'frameWidth')), - 'height': int_or_none(text_or_none(asset, 'frameHeight')), - 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), - 'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), - 'vcodec': text_or_none(asset, 'codecVideo'), - 'acodec': text_or_none(asset, 'codecAudio'), - 'container': text_or_none(asset, 'mediaType'), - 'filesize': int_or_none(text_or_none(asset, 'size')), - } for asset in assets.findall('asset') - if asset.find('downloadUrl') is not None] - + def _extract_formats(self, assets, media_id): + formats = [] + for asset in assets.findall('asset'): + format_url = xpath_text(asset, ['downloadUrl', 'url']) + asset_type = asset.get('type') + if asset_type == 'HDS': + f4m_formats = self._extract_f4m_formats( + format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif asset_type == 'HLS': + m3u8_formats = self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + format_info = { + 'ext': xpath_text(asset, 'mediaType'), + 'width': int_or_none(xpath_text(asset, 'frameWidth')), + 'height': int_or_none(xpath_text(asset, 'frameHeight')), + 'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')), + 'abr': int_or_none(xpath_text(asset, 'bitrateAudio')), + 'vcodec': xpath_text(asset, 'codecVideo'), + 'acodec': xpath_text(asset, 'codecAudio'), + 'container': xpath_text(asset, 'mediaType'), + 'filesize': int_or_none(xpath_text(asset, 'size')), + } + format_url = self._proto_relative_url(format_url) + if format_url: + http_format_info = format_info.copy() + http_format_info.update({ + 'url': format_url, + 'format_id': 'http-%s' % asset_type, + }) + formats.append(http_format_info) + server_prefix = xpath_text(asset, 'serverPrefix') + if server_prefix: + rtmp_format_info = format_info.copy() + rtmp_format_info.update({ + 'url': server_prefix, + 'play_path': xpath_text(asset, 'fileName'), + 'format_id': 'rtmp-%s' % asset_type, + }) + formats.append(rtmp_format_info) self._sort_formats(formats) return formats - def _extract_thumbnails(self, variants): + def _extract_thumbnails(self, variants, base_url): thumbnails = [{ - 'url': self._BASE_URL + variant.find('url').text, - 'width': int_or_none(variant.find('width').text), - 'height': int_or_none(variant.find('height').text), - } for variant in variants.findall('variant')] + 'url': base_url + xpath_text(variant, 'url'), + 'width': int_or_none(xpath_text(variant, 'width')), + 'height': int_or_none(xpath_text(variant, 'height')), + } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails From eed30fea755f454582eb5ff555517cce3870dc87 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 22:10:16 +0100 Subject: [PATCH 91/92] [flickr] fix format sorting --- youtube_dl/extractor/flickr.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 452b27b26..18f439df9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -50,13 +50,19 @@ class FlickrIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_key = self._download_json('https://www.flickr.com/hermes_error_beacon.gne', video_id, 'Downloading api key',)['site_key'] + api_key = self._download_json( + 'https://www.flickr.com/hermes_error_beacon.gne', video_id, + 'Downloading api key')['site_key'] - video_info = self._call_api('photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] + video_info = self._call_api( + 'photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] if video_info['media'] == 'video': - streams = self._call_api('video.getStreamInfo', video_id, api_key, 'Downloading streams info', video_info['secret'])['streams'] + streams = self._call_api( + 'video.getStreamInfo', video_id, api_key, + 'Downloading streams info', video_info['secret'])['streams'] - preference = qualities(['iphone_wifi', '700', 'appletv', 'orig']) + preference = qualities( + ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig']) formats = [] for stream in streams['stream']: From c7224074d61adf0dda362332a27f8ae20c4502cd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 21 Dec 2015 23:02:55 +0100 Subject: [PATCH 92/92] [audimedia] correct test case id --- youtube_dl/extractor/audimedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index b0b089dee..4382a302b 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -15,7 +15,7 @@ class AudiMediaIE(InfoExtractor): 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', 'md5': '79a8b71c46d49042609795ab59779b66', 'info_dict': { - 'id': '1564', + 'id': '1565', 'ext': 'mp4', 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', 'description': 'md5:60e5d30a78ced725f7b8d34370762941',