From 431b60a7885e3f471f4983bcaa8e75bd8989d8be Mon Sep 17 00:00:00 2001
From: gfabiano <gfabiano40@gmail.com>
Date: Fri, 16 Jun 2017 01:46:58 +0200
Subject: [PATCH 1/3] [mirror] Add new extractor

---
Note (reconstruction): this series was recovered from a copy in which all
newlines and indentation had been collapsed and some "<...>" tokens had
been stripped. Indentation is re-derived from Python syntax. Named groups
and tags were restored from the group names used by the code and from the
matching lines of the later patches; the author address on patches 1 and 2
is taken from the PATCH 3/3 header. The re.search() pattern in this patch
also lost a literal prefix (beginning with "<") that could not be
recovered, so only its group names are restored - confirm against the
original commit before relying on it.

 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/mirror.py     | 73 ++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 youtube_dl/extractor/mirror.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 7e45232dd..cd9cf0149 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -565,6 +565,7 @@ from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .minoto import MinotoIE
 from .miomio import MioMioIE
+from .mirror import MirrorIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mixcloud import (
diff --git a/youtube_dl/extractor/mirror.py b/youtube_dl/extractor/mirror.py
new file mode 100644
index 000000000..e8512a7c2
--- /dev/null
+++ b/youtube_dl/extractor/mirror.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+)
+
+import re
+
+
+class MirrorIE(InfoExtractor):
+    _VALID_URL = (r'https?://(?:www\.)?(?:mirror\.co\.uk)/.*')
+
+    _TESTS = [{
+        'url': 'http://www.mirror.co.uk/news/uk-news/grenfell-tower-block-fire-london-10619120?service=responsive',
+        'md5': 'e8c52bcdf5180884b4e4d3159de3a40b',
+        'info_dict': {
+            'id': '5470567455001',
+            'ext': 'mp4',
+            'title': 'Blaze at Grenfell Tower continues 11 hours on',
+            'timestamp': 1497433370,
+            'uploader_id': '4221396001',
+            'upload_date': '20170614'
+        }
+    }, {
+        'url': 'http://www.mirror.co.uk/tv/tv-news/tim-brown-grenfell-fire-us-10627790',
+        'md5': 'cd8e2ee6a57b043d9612321d8b4d07be',
+        'info_dict': {
+            'id': '5472207456001',
+            'ext': 'mp4',
+            'title': 'Structural engineer who warned of cladding fire dangers explains Grenfell Tower fire',
+            'timestamp': 1497527486,
+            'uploader_id': '4221396001',
+            'upload_date': '20170615',
+            'description': 'Structural engineer who warned of cladding fire dangers explains what made Grenfell Tower a death trap'
+        }
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, '', 'Downloading webpage')
+
+        mobj = re.search(r'(?P<video_url>.+?brightcove\.com/\d+/(?P<account_id>[^/]+)/\d+/\d+/\2_\d+_(?P<video_id>[^.]+)\.mp4)', webpage)
+
+        if mobj is None:
+            raise ExtractorError('Video does not exist', expected=True)
+
+        account_id = mobj.group('account_id')
+        video_id = mobj.group('video_id')
+        video_url = mobj.group('video_url')
+
+        player_id = self._search_regex(
+            r'(&l?s?quot?;)+playerId\1+:\1+(?P<player_id>[a-zA-Z0-9]+)\1+,',
+            webpage, 'player id', group='player_id'
+        )
+
+        player_url = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' % (account_id, player_id, video_id)
+        info = None
+        try:
+            info = self.url_result(
+                player_url, 'BrightcoveNew', video_id
+            )
+        except:
+            info = {
+                'id': video_id,
+                'title': self._search_regex(
+                    r'(&l?s?quot?;)videoTitle\1+:\1(?P<video_title>[^&]+)\1[^}]+%s' % video_id,
+                    webpage, 'video title', group='video_title'
+                ),
+                'url': video_url
+            }
+        return info

From 161a4d3b288c69f97b797c361a8e531b625a4baf Mon Sep 17 00:00:00 2001
From: gfabiano <gfabiano40@gmail.com>
Date: Fri, 16 Jun 2017 22:25:15 +0200
Subject: [PATCH 2/3] requested fixs

---
 youtube_dl/extractor/mirror.py | 94 +++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 42 deletions(-)

diff --git a/youtube_dl/extractor/mirror.py b/youtube_dl/extractor/mirror.py
index e8512a7c2..7086b6c7e 100644
--- a/youtube_dl/extractor/mirror.py
+++ b/youtube_dl/extractor/mirror.py
@@ -5,15 +5,16 @@ from .common import InfoExtractor
 
 from ..utils import (
     ExtractorError,
+    RegexNotFoundError,
 )
 
 import re
 
 
 class MirrorIE(InfoExtractor):
-    _VALID_URL = (r'https?://(?:www\.)?(?:mirror\.co\.uk)/.*')
+    _VALID_URL = r'https?://(?:www\.)?mirror\.co\.uk.*/(?P<id>[^/?#&]+)'
 
-    _TESTS = [{
+    _TEST = {
         'url': 'http://www.mirror.co.uk/news/uk-news/grenfell-tower-block-fire-london-10619120?service=responsive',
         'md5': 'e8c52bcdf5180884b4e4d3159de3a40b',
         'info_dict': {
@@ -21,53 +22,62 @@ class MirrorIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Blaze at Grenfell Tower continues 11 hours on',
             'timestamp': 1497433370,
+            'upload_date': '20170614',
             'uploader_id': '4221396001',
-            'upload_date': '20170614'
         }
-    }, {
-        'url': 'http://www.mirror.co.uk/tv/tv-news/tim-brown-grenfell-fire-us-10627790',
-        'md5': 'cd8e2ee6a57b043d9612321d8b4d07be',
-        'info_dict': {
-            'id': '5472207456001',
-            'ext': 'mp4',
-            'title': 'Structural engineer who warned of cladding fire dangers explains Grenfell Tower fire',
-            'timestamp': 1497527486,
-            'uploader_id': '4221396001',
-            'upload_date': '20170615',
-            'description': 'Structural engineer who warned of cladding fire dangers explains what made Grenfell Tower a death trap'
-        }
-    }]
+    }
 
     def _real_extract(self, url):
-        webpage = self._download_webpage(url, '', 'Downloading webpage')
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
 
-        mobj = re.search(r'(?P<video_url>.+?brightcove\.com/\d+/(?P<account_id>[^/]+)/\d+/\d+/\2_\d+_(?P<video_id>[^.]+)\.mp4)', webpage)
+        title = None
+        account_id = None
+        video_id = None
+        player_id = None
 
-        if mobj is None:
-            raise ExtractorError('Video does not exist', expected=True)
-
-        account_id = mobj.group('account_id')
-        video_id = mobj.group('video_id')
-        video_url = mobj.group('video_url')
-
-        player_id = self._search_regex(
-            r'(&l?s?quot?;)+playerId\1+:\1+(?P<player_id>[a-zA-Z0-9]+)\1+,',
-            webpage, 'player id', group='player_id'
-        )
-
-        player_url = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' % (account_id, player_id, video_id)
-        info = None
         try:
-            info = self.url_result(
-                player_url, 'BrightcoveNew', video_id
+            json_data = self._parse_json(self._html_search_regex(
+                r'<div[^>]+class=(["\'])json-placeholder\1[^>]+data-json=\1(?P<json>.*?)\1',
+                webpage, 'extract json', group='json', fatal=False
+            ), display_id, fatal=False)
+
+            if all(k in json_data for k in ('playerData', 'videoData')):
+                player_id = json_data['playerData'].get('playerId')
+                account_id = json_data['playerData'].get('account')
+                video_id = json_data['videoData'].get('videoId')
+                title = json_data['videoData'].get('videoTitle')
+            else:
+                raise ExtractorError('json data not found')
+        except (RegexNotFoundError, ExtractorError):
+            title = self._og_search_title(
+                webpage, default=None) or self._search_regex(
+                r'<title>([^<]+)', webpage,
+                'title', default=None)
+            account_id = self._html_search_regex(
+                r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/\d+/(?P<account_id>[^/]+)/',
+                webpage, 'account id', group='account_id'
             )
-        except:
-            info = {
+            video_id = self._html_search_regex(
+                r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/[a-zA-Z0-9-_/]+_(?P<video_id>\d+)\.mp4',
+                webpage, 'video id', group='video_id'
+            )
+
+        try:
+            return {
+                '_type': 'url_transparent',
                 'id': video_id,
-                'title': self._search_regex(
-                    r'(&l?s?quot?;)videoTitle\1+:\1(?P<video_title>[^&]+)\1[^}]+%s' % video_id,
-                    webpage, 'video title', group='video_title'
-                ),
-                'url': video_url
+                'display_id': display_id,
+                'url': 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' % (account_id, player_id, video_id),
+                'ie_key': 'BrightcoveNew'
+            }
+        except ExtractorError:
+            return {
+                'id': video_id,
+                'title': title,
+                'url': self._search_regex(
+                    r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"](?P<video_url>.+?brightcove\.com\/[^\'"]+)',
+                    webpage, 'video url', group='video_url'
+                ),
+                'uploader_id': account_id
             }
-        return info

From 1595b83b9b943ba38094a04d2eb7f9ba02681b05 Mon Sep 17 00:00:00 2001
From: gfabiano <gfabiano40@gmail.com>
Date: Mon, 30 Jul 2018 16:07:46 +0200
Subject: [PATCH 3/3] improved and fixed

---
NOTE(review): two regexes added by this patch look wrong and should be
confirmed before merging (the committed lines below are left unchanged):
- r'<title>([^>]+)' (previously [^<]+) also matches "</title", so the
  captured title runs past the closing tag.
- "[a-zA-Z0-9-_/]" contains the range "9-_" rather than a literal "-";
  "[a-zA-Z0-9_/-]" is probably what was meant.

 youtube_dl/extractor/mirror.py | 47 +++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/youtube_dl/extractor/mirror.py b/youtube_dl/extractor/mirror.py
index 7086b6c7e..cbd8fcd7a 100644
--- a/youtube_dl/extractor/mirror.py
+++ b/youtube_dl/extractor/mirror.py
@@ -6,10 +6,9 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     RegexNotFoundError,
+    unified_strdate,
 )
 
-import re
-
 
 class MirrorIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?mirror\.co\.uk.*/(?P<id>[^/?#&]+)'
@@ -27,6 +26,8 @@ class MirrorIE(InfoExtractor):
         }
     }
 
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
@@ -39,45 +40,51 @@ class MirrorIE(InfoExtractor):
         try:
             json_data = self._parse_json(self._html_search_regex(
                 r'<div[^>]+class=(["\'])json-placeholder\1[^>]+data-json=\1(?P<json>.*?)\1',
-                webpage, 'extract json', group='json', fatal=False
-            ), display_id, fatal=False)
+                webpage, 'extract json', group='json', default='{}'
+            ), display_id)
 
             if all(k in json_data for k in ('playerData', 'videoData')):
                 player_id = json_data['playerData'].get('playerId')
                 account_id = json_data['playerData'].get('account')
-                video_id = json_data['videoData'].get('videoId')
-                title = json_data['videoData'].get('videoTitle')
+                video_id = json_data['videoData']['videoId']
+                title = json_data['videoData']['videoTitle']
             else:
                 raise ExtractorError('json data not found')
-        except (RegexNotFoundError, ExtractorError):
+        except (RegexNotFoundError, ExtractorError, KeyError):
             title = self._og_search_title(
-                webpage, default=None) or self._search_regex(
-                r'<title>([^<]+)', webpage,
-                'title', default=None)
+                webpage, default=None) or self._html_search_regex(
+                r'<title>([^>]+)', webpage,
+                'title')
             account_id = self._html_search_regex(
-                r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/\d+/(?P<account_id>[^/]+)/',
-                webpage, 'account id', group='account_id'
+                r'<meta[^>]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/\d+/([^/]+)/',
+                webpage, 'account id', default=None
             )
             video_id = self._html_search_regex(
-                r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/[a-zA-Z0-9-_/]+_(?P<video_id>\d+)\.mp4',
-                webpage, 'video id', group='video_id'
+                r'<meta[^>]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"].+?brightcove\.com/[a-zA-Z0-9-_/]+_(\d+)\.mp4',
+                webpage, 'video id'
             )
 
-        try:
+        if player_id and account_id:
             return {
                 '_type': 'url_transparent',
                 'id': video_id,
                 'display_id': display_id,
-                'url': 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' % (account_id, player_id, video_id),
+                'title': title,
+                'url': self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
                 'ie_key': 'BrightcoveNew'
             }
-        except ExtractorError:
+        else:  # fallback
             return {
                 'id': video_id,
+                'display_id': display_id,
                 'title': title,
                 'url': self._search_regex(
-                    r'<meta[^<]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"](?P<video_url>.+?brightcove\.com\/[^\'"]+)',
-                    webpage, 'video url', group='video_url'
+                    r'<meta[^>]+?property=[\'"]+videoUrl[\'"]+.+?content=[\'"](.+?brightcove\.com\/[^\'"]+)',
+                    webpage, 'video url'
                 ),
-                'uploader_id': account_id
+                'uploader_id': account_id,
+                'upload_date': unified_strdate(self._html_search_regex(
+                    r'<time[^>]+?class=[\'"]+\s*?date-published\s*[\'"]+\s*datetime=[\'"]+([^\'">]+)',
+                    webpage, 'date', default=None)
+                )
             }