From e4b81d8ee9f915f6f4c43bc4085352433d583d5d Mon Sep 17 00:00:00 2001 From: jgilf Date: Wed, 27 May 2020 16:03:28 +1000 Subject: [PATCH 1/5] [Viddler] Add ViddlerBaseIE --- youtube_dl/extractor/viddler.py | 126 ++++++++++++++++---------------- 1 file changed, 65 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 642358433..a7f7ab063 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -9,7 +9,70 @@ from ..utils import ( ) -class ViddlerIE(InfoExtractor): +class ViddlerBaseIE(InfoExtractor): + def _extract_viddler_info(self, url, video_id, secret): + query = { + 'video_id': video_id, + 'key': 'v0vhrt7bg2xq1vyxhkct', + } + if secret: + query['secret'] = secret + + data = self._download_json( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', + video_id, headers={'Referer': url}, query=query)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + format_id = filed.get('profile_id') or filed['profile_name'] + f = { + 'format_id': format_id, + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') + f['format_id'] = format_id + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['html5_video_source']) + f['format_id'] = format_id + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': categories, + } + + +class ViddlerIE(ViddlerBaseIE): _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P[a-z0-9]+)(?:.+?\bsecret=(\d+))?' _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', @@ -76,63 +139,4 @@ class ViddlerIE(InfoExtractor): def _real_extract(self, url): video_id, secret = re.match(self._VALID_URL, url).groups() - - query = { - 'video_id': video_id, - 'key': 'v0vhrt7bg2xq1vyxhkct', - } - if secret: - query['secret'] = secret - - data = self._download_json( - 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', - video_id, headers={'Referer': url}, query=query)['video'] - - formats = [] - for filed in data['files']: - if filed.get('status', 'ready') != 'ready': - continue - format_id = filed.get('profile_id') or filed['profile_name'] - f = { - 'format_id': format_id, - 'format_note': filed['profile_name'], - 'url': self._proto_relative_url(filed['url']), - 'width': int_or_none(filed.get('width')), - 'height': int_or_none(filed.get('height')), - 'filesize': int_or_none(filed.get('size')), - 'ext': filed.get('ext'), - 'source_preference': -1, - } - formats.append(f) - - if filed.get('cdn_url'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') - f['format_id'] = format_id + '-cdn' - f['source_preference'] = 1 - formats.append(f) - - if filed.get('html5_video_source'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['html5_video_source']) - f['format_id'] = format_id + '-html5' - f['source_preference'] = 0 - formats.append(f) - self._sort_formats(formats) - - categories = [ - t.get('text') for t in data.get('tags', []) if 'text' in t] - - return { - 'id': video_id, - 'title': data['title'], - 'formats': formats, - 'description': data.get('description'), - 'timestamp': int_or_none(data.get('upload_time')), - 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), - 'uploader': data.get('author'), - 'duration': float_or_none(data.get('length')), - 'view_count': int_or_none(data.get('view_count')), - 'comment_count': int_or_none(data.get('comment_count')), - 'categories': categories, - } + return self._extract_viddler_info(url, video_id, secret) From 8ae114ea933763994af8856f4b67af28c06060b9 Mon Sep 17 00:00:00 2001 From: jgilf Date: Wed, 27 May 2020 16:04:06 +1000 Subject: [PATCH 2/5] [British Council] Add new extractor --- youtube_dl/extractor/britishcouncil.py | 30 ++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/britishcouncil.py diff --git a/youtube_dl/extractor/britishcouncil.py b/youtube_dl/extractor/britishcouncil.py new file mode 100644 index 000000000..c6afe6b61 --- /dev/null +++ b/youtube_dl/extractor/britishcouncil.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .viddler import ViddlerBaseIE + + +class BritishCouncilIE(ViddlerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:learnenglish\.)?britishcouncil.org/(?P.*)' + _TEST = { + 'url': 'https://learnenglish.britishcouncil.org/episode-01-they-meet', + 'md5': '796e9c4fa07017e3da79d5e99ef36fe8', + 'info_dict': { + 'id': '34d5e84c', + 'ext': 'mp4', + 'title': 'StartingOut.s01e01', + 'upload_date': '20160927', + 'uploader': 'BCLearnenglish', + 'timestamp': 1474975664, + 'view_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'data-video-id=([\'"])(?P[^\'"]+)\1', + webpage, 'video ID', group='id', default=None) + return self._extract_viddler_info(url, video_id, None) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4b3092028..58e2d0514 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -132,6 +132,7 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .britishcouncil import BritishCouncilIE from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE From e488cfaf0e02b67308e90d9ad0a3a825ad9ada1f Mon Sep 17 00:00:00 2001 From: jgilf Date: Wed, 27 May 2020 22:03:34 +1000 Subject: [PATCH 3/5] [Viddler] Add support for generic embeds --- youtube_dl/extractor/britishcouncil.py | 30 ------ youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 21 ++++ youtube_dl/extractor/viddler.py | 135 +++++++++++++------------ 4 files changed, 91 insertions(+), 96 deletions(-) delete mode 100644 youtube_dl/extractor/britishcouncil.py diff --git a/youtube_dl/extractor/britishcouncil.py b/youtube_dl/extractor/britishcouncil.py deleted file mode 100644 index c6afe6b61..000000000 --- a/youtube_dl/extractor/britishcouncil.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .viddler import ViddlerBaseIE - - -class BritishCouncilIE(ViddlerBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:learnenglish\.)?britishcouncil.org/(?P.*)' - _TEST = { - 'url': 'https://learnenglish.britishcouncil.org/episode-01-they-meet', - 'md5': '796e9c4fa07017e3da79d5e99ef36fe8', - 'info_dict': { - 'id': '34d5e84c', - 'ext': 'mp4', - 'title': 'StartingOut.s01e01', - 'upload_date': '20160927', - 'uploader': 'BCLearnenglish', - 'timestamp': 1474975664, - 'view_count': int, - 'comment_count': int, - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - r'data-video-id=([\'"])(?P[^\'"]+)\1', - webpage, 'video ID', group='id', default=None) - return self._extract_viddler_info(url, video_id, None) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 58e2d0514..4b3092028 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -132,7 +132,6 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) -from .britishcouncil import BritishCouncilIE from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ce8252f6a..e8e87b356 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .viddler import ViddlerIE class GenericIE(InfoExtractor): @@ -1098,6 +1099,20 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Viddler'], }, + { + 'url': 'https://learnenglish.britishcouncil.org/episode-01-they-meet', + 'md5': '796e9c4fa07017e3da79d5e99ef36fe8', + 'info_dict': { + 'id': '34d5e84c', + 'ext': 'mp4', + 'title': 'StartingOut.s01e01', + 'upload_date': '20160927', + 'uploader': 'BCLearnenglish', + 'timestamp': 1474975664, + 'view_count': int, + 'comment_count': int, + }, + }, # Libsyn embed { 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', @@ -2580,6 +2595,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + mobj = re.search( + r'
[^\'"]+)\1', + webpage) + if mobj is not None: + return ViddlerIE._build_url_result(mobj.group('id')) + # Look for NYTimes player mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index a7f7ab063..421ec7d09 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -9,70 +9,7 @@ from ..utils import ( ) -class ViddlerBaseIE(InfoExtractor): - def _extract_viddler_info(self, url, video_id, secret): - query = { - 'video_id': video_id, - 'key': 'v0vhrt7bg2xq1vyxhkct', - } - if secret: - query['secret'] = secret - - data = self._download_json( - 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', - video_id, headers={'Referer': url}, query=query)['video'] - - formats = [] - for filed in data['files']: - if filed.get('status', 'ready') != 'ready': - continue - format_id = filed.get('profile_id') or filed['profile_name'] - f = { - 'format_id': format_id, - 'format_note': filed['profile_name'], - 'url': self._proto_relative_url(filed['url']), - 'width': int_or_none(filed.get('width')), - 'height': int_or_none(filed.get('height')), - 'filesize': int_or_none(filed.get('size')), - 'ext': filed.get('ext'), - 'source_preference': -1, - } - formats.append(f) - - if filed.get('cdn_url'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') - f['format_id'] = format_id + '-cdn' - f['source_preference'] = 1 - formats.append(f) - - if filed.get('html5_video_source'): - f = f.copy() - f['url'] = self._proto_relative_url(filed['html5_video_source']) - f['format_id'] = format_id + '-html5' - f['source_preference'] = 0 - formats.append(f) - self._sort_formats(formats) - - categories = [ - t.get('text') for t in data.get('tags', []) if 'text' in t] - - return { - 'id': video_id, - 'title': data['title'], - 'formats': formats, - 'description': data.get('description'), - 'timestamp': int_or_none(data.get('upload_time')), - 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), - 'uploader': data.get('author'), - 'duration': float_or_none(data.get('length')), - 'view_count': int_or_none(data.get('view_count')), - 'comment_count': int_or_none(data.get('comment_count')), - 'categories': categories, - } - - -class ViddlerIE(ViddlerBaseIE): +class ViddlerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P[a-z0-9]+)(?:.+?\bsecret=(\d+))?' _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', @@ -137,6 +74,74 @@ class ViddlerIE(ViddlerBaseIE): }, }] + @staticmethod + def _url_for_id(id): + return 'http://www.viddler.com/v/%s' % id + + @classmethod + def _build_url_result(cls, id): + return cls.url_result(cls._url_for_id(id), + ie=cls.ie_key()) + def _real_extract(self, url): video_id, secret = re.match(self._VALID_URL, url).groups() - return self._extract_viddler_info(url, video_id, secret) + + query = { + 'video_id': video_id, + 'key': 'v0vhrt7bg2xq1vyxhkct', + } + if secret: + query['secret'] = secret + + data = self._download_json( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json', + video_id, headers={'Referer': url}, query=query)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + format_id = filed.get('profile_id') or filed['profile_name'] + f = { + 'format_id': format_id, + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:') + f['format_id'] = format_id + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['html5_video_source']) + f['format_id'] = format_id + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': categories, + } From 847e96d70ab67063791368750aeb6fc15d0888f1 Mon Sep 17 00:00:00 2001 From: jgilf Date: Thu, 28 May 2020 12:07:30 +1000 Subject: [PATCH 4/5] [Viddler] Fix regex to meet coding conventions --- youtube_dl/extractor/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e8e87b356..ec9efe5cd 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2595,11 +2595,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - mobj = re.search( - r'
[^\'"]+)\1', - webpage) - if mobj is not None: - return ViddlerIE._build_url_result(mobj.group('id')) + viddler_id = self._search_regex( + r']+class=(["\'])viddler-auto-embed\1[^>]+data-video-id=([\'"])(?P[^\'"]+)', + webpage, 'viddler_id', group='id') + if viddler_id is not None: + return ViddlerIE._build_url_result(viddler_id) # Look for NYTimes player mobj = re.search( From 558052ee16c5d0b48b17081067074e1a6324a855 Mon Sep 17 00:00:00 2001 From: jgilf Date: Thu, 28 May 2020 15:39:30 +1000 Subject: [PATCH 5/5] [Viddler] Fix errors --- youtube_dl/extractor/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec9efe5cd..f540be72e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2595,11 +2595,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - viddler_id = self._search_regex( + mobj = re.search( r']+class=(["\'])viddler-auto-embed\1[^>]+data-video-id=([\'"])(?P[^\'"]+)', - webpage, 'viddler_id', group='id') - if viddler_id is not None: - return ViddlerIE._build_url_result(viddler_id) + webpage) + if mobj is not None: + return ViddlerIE._build_url_result(mobj.group('id')) # Look for NYTimes player mobj = re.search(