From 91e64c6941d9057b8b20405323988480746ead3e Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Thu, 10 Aug 2017 08:14:15 -0400 Subject: [PATCH 1/9] [spreaker] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/spreaker.py | 161 +++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 youtube_dl/extractor/spreaker.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 897557f93..09ee08aad 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -958,6 +958,7 @@ from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE from .sportschau import SportschauIE +from .spreaker import SpreakerIE from .sprout import SproutIE from .srgssr import ( SRGSSRIE, diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py new file mode 100644 index 000000000..3982267a8 --- /dev/null +++ b/youtube_dl/extractor/spreaker.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none, ExtractorError + + +class SpreakerIE(InfoExtractor): + IE_NAME = 'spreaker' + _VALID_URL = r"""(?x)^ + https?:// + (?:www.|api.)? 
+ spreaker.com/ + (?: + show/[a-z0-9_-]+| + user/[a-z0-9_-]+/[a-z0-9_-]| + episode/(?P[0-9]+) + ) + """ + _TESTS = [ + { + 'url': 'https://www.spreaker.com/show/success-with-music', + 'info_dict': { + 'title': 'Success With Music', + 'id': 2317431, + }, + 'playlist_mincount': 14, + }, + { + 'url': ('https://www.spreaker.com/user/9780658/swm-ep15-how-to-' + 'market-your-music-part-2'), + 'info_dict': { + 'id': '12534508', + 'ext': 'mp3', + 'title': 'Marketing Your Music - Part 2', + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': 9780658, + }, + }, + { + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'ext': 'mp3', + 'title': 'Marketing Your Music - Part 2', + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': 9780658, + }, + } + ] + + def _spreaker_episode_data_to_info(self, data): + upload_date = data['published_at'][0:10].replace('-', '') + author = data.get('author') + if not author: + author = {} + stats = data.get('stats') + view_count = like_count = comment_count = 0 + show = data.get('show') + if not show: + show = {} + else: + show_image = show.get('image') + if not show_image: + show_image = {} + + if stats: + view_count = (stats.get('plays', 0) + + stats.get('plays_streaming', 0) + + stats.get('plays_download', 0)) + like_count = stats.get('likes', 0) + comment_count = stats.get('messages', 0) + + return { + 'id': compat_str(data['episode_id']), + 'title': data['title'], + 'url': data['download_url'], + 'display_id': data.get('permalink'), + 'webpage_url': data.get('site_url'), + 'uploader': author.get('fullname'), + 'creator': author.get('fullname'), + 'release_date': upload_date, + 'upload_date': upload_date, + 'uploader_id': author.get('user_id'), + 'duration': int_or_none(data.get('length')), + 'view_count': int_or_none(view_count), + 'like_count': int_or_none(like_count), + 'comment_count': int_or_none(comment_count), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 
'container': 'mp3', + 'ext': 'mp3', + 'thumbnail': show_image.get('big_url'), + 'language': show.get('language'), + 'thumbnails': [ + { + 'id': show_image.get('image_id'), + 'url': show_image.get('big_url'), + 'width': int_or_none(show_image.get('width')), + 'height': int_or_none(show_image.get('height')), + }, + { + 'url': show_image.get('large_url'), + }, + { + 'url': show_image.get('medium_url') + }, + { + 'url': show_image.get('small_url') + }, + ], + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + + if re.match(r'^[0-9]+$', episode_id): + data_url = url + elif '/show/' in url: + html = self._download_webpage(url, None) + playlist_url = self._html_search_regex( + r'data-playlist_url="(?Phttps\://[^"]+")', html, 'url') + items = self._download_json(playlist_url, None) + items = items['response']['playlist']['items'] + + if not items: + raise ExtractorError('Empty playlist') + + urls = [x['api_url'] for x in items] + ret = [] + for index, url in enumerate(urls): + data = self._download_json(url, None)['response']['episode'] + dict_ = self._spreaker_episode_data_to_info(data) + dict_.update({ + 'playlist_id': compat_str(data['show_id']), + 'playlist_title': data['show']['title'], + 'playlist_index': index, + }) + ret.append(dict_) + + return self.playlist_result(ret, + data['show_id'], + data['show']['title']) + else: + html = self._download_webpage(url, None) + episode_id = self._html_search_regex( + r'data-episode_id="(?P[0-9]+)"', html, 'id') + if not re.match(r'^[0-9]+$', episode_id): + raise ExtractorError('Could not find episode ID') + data_url = 'https://api.spreaker.com/episode/%s' % (episode_id) + + data = self._download_json(data_url, episode_id)['response']['episode'] + if not data['download_enabled']: + raise ExtractorError('Not supported yet') + + return self._spreaker_episode_data_to_info(data) From 2eb228df1cb2ad0c91ef37fe25be47add412d313 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Thu, 10 Aug 2017 16:38:47 -0400 
Subject: [PATCH 2/9] [spreaker] Fixes requested Escape . in regexes Make separate extractors for episode page, playlist (show), API Support API's direct links to MP3 files Make counts set to None in case they are not found Handle when published_at is not present Other fixes --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/spreaker.py | 176 +++++++++++++++-------------- 2 files changed, 99 insertions(+), 83 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 09ee08aad..7ca2cfd19 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -958,7 +958,11 @@ from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE from .sportschau import SportschauIE -from .spreaker import SpreakerIE +from .spreaker import ( + SpreakerIE, + SpreakerAPIEpisodeIE, + SpreakerPlaylistIE +) from .sprout import SproutIE from .srgssr import ( SRGSSRIE, diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index 3982267a8..d2fb6c304 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -8,39 +8,49 @@ from ..compat import compat_str from ..utils import int_or_none, ExtractorError -class SpreakerIE(InfoExtractor): +class SpreakerPlaylistIE(InfoExtractor): IE_NAME = 'spreaker' - _VALID_URL = r"""(?x)^ - https?:// - (?:www.|api.)? 
- spreaker.com/ - (?: - show/[a-z0-9_-]+| - user/[a-z0-9_-]+/[a-z0-9_-]| - episode/(?P[0-9]+) - ) - """ - _TESTS = [ - { + _VALID_URL = r'^https?://(?:www\.)?spreaker\.com/show/[a-z0-9_-]+' + _TEST = { 'url': 'https://www.spreaker.com/show/success-with-music', 'info_dict': { 'title': 'Success With Music', 'id': 2317431, }, 'playlist_mincount': 14, - }, - { - 'url': ('https://www.spreaker.com/user/9780658/swm-ep15-how-to-' - 'market-your-music-part-2'), - 'info_dict': { - 'id': '12534508', - 'ext': 'mp3', - 'title': 'Marketing Your Music - Part 2', - 'upload_date': '20170809', - 'uploader': 'SWM', - 'uploader_id': 9780658, - }, - }, + } + + def _real_extract(self, url): + html = self._download_webpage(url, None) + playlist_url = self._html_search_regex( + r'data-playlist_url="(?Phttps\://[^"]+")', html, 'url') + items = self._download_json(playlist_url, None) + items = items['response']['playlist']['items'] + + if not items: + raise ExtractorError('Empty playlist') + + urls = [x['api_url'] for x in items] + ret = [] + for index, url in enumerate(urls): + data = self._download_json(url, None)['response']['episode'] + dict_ = SpreakerIE._spreaker_episode_data_to_info(data) + dict_.update({ + 'playlist_id': compat_str(data['show_id']), + 'playlist_title': data['show']['title'], + 'playlist_index': index, + }) + ret.append(dict_) + + return self.playlist_result(ret, + data['show_id'], + data['show']['title']) + + +class SpreakerAPIEpisodeIE(InfoExtractor): + IE_NAME = 'spreaker' + _VALID_URL = r'^https?://(?:api\.)?spreaker\.com/(?:download/)?episode/(?P[0-9]+)(?:/[^\.]+\.mp3$)?' 
+ _TESTS = [ { 'url': 'https://api.spreaker.com/episode/12534508', 'info_dict': { @@ -51,23 +61,59 @@ class SpreakerIE(InfoExtractor): 'uploader': 'SWM', 'uploader_id': 9780658, }, - } + }, + { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'info_dict': { + 'id': '12534508', + 'ext': 'mp3', + 'title': 'Marketing Your Music - Part 2', + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': 9780658, + }, + }, ] - def _spreaker_episode_data_to_info(self, data): - upload_date = data['published_at'][0:10].replace('-', '') - author = data.get('author') - if not author: - author = {} - stats = data.get('stats') - view_count = like_count = comment_count = 0 - show = data.get('show') - if not show: - show = {} - else: - show_image = show.get('image') - if not show_image: - show_image = {} + def _real_extract(self, url): + episode_id = self._match_id(url) + if not re.match(r'^[0-9]+$', episode_id): + raise ExtractorError('Invalid ID') + + url = 'https://api.spreaker.com/episode/%s' % (episode_id,) + data = self._download_json(url, episode_id)['response']['episode'] + if not data['download_enabled']: + raise ExtractorError('Not supported yet') + + return SpreakerIE._spreaker_episode_data_to_info(data) + + +class SpreakerIE(InfoExtractor): + IE_NAME = 'spreaker' + _VALID_URL = r'^https?://(?:www\.)?spreaker\.com/user/[a-z0-9_-]+/[a-z0-9_-]' + _TEST = { + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'info_dict': { + 'id': '12534508', + 'ext': 'mp3', + 'title': 'Marketing Your Music - Part 2', + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': 9780658, + }, + } + + @staticmethod + def _spreaker_episode_data_to_info(data): + published_at = data.get('published_at') + upload_date = None + if published_at: + upload_date = published_at[0:10].replace('-', '') + author = data.get('author', {}) + stats = data.get('stats', {}) + view_count = like_count = 
comment_count = None + show = data.get('show', {}) + show_image = show.get('image', {}) if stats: view_count = (stats.get('plays', 0) + @@ -117,45 +163,11 @@ class SpreakerIE(InfoExtractor): } def _real_extract(self, url): - episode_id = self._match_id(url) + html = self._download_webpage(url, None) + episode_id = self._html_search_regex( + r'data-episode_id="(?P[0-9]+)"', html, 'id') + if not re.match(r'^[0-9]+$', episode_id): + raise ExtractorError('Could not find episode ID') + data_url = 'https://api.spreaker.com/episode/%s' % (episode_id) - if re.match(r'^[0-9]+$', episode_id): - data_url = url - elif '/show/' in url: - html = self._download_webpage(url, None) - playlist_url = self._html_search_regex( - r'data-playlist_url="(?Phttps\://[^"]+")', html, 'url') - items = self._download_json(playlist_url, None) - items = items['response']['playlist']['items'] - - if not items: - raise ExtractorError('Empty playlist') - - urls = [x['api_url'] for x in items] - ret = [] - for index, url in enumerate(urls): - data = self._download_json(url, None)['response']['episode'] - dict_ = self._spreaker_episode_data_to_info(data) - dict_.update({ - 'playlist_id': compat_str(data['show_id']), - 'playlist_title': data['show']['title'], - 'playlist_index': index, - }) - ret.append(dict_) - - return self.playlist_result(ret, - data['show_id'], - data['show']['title']) - else: - html = self._download_webpage(url, None) - episode_id = self._html_search_regex( - r'data-episode_id="(?P[0-9]+)"', html, 'id') - if not re.match(r'^[0-9]+$', episode_id): - raise ExtractorError('Could not find episode ID') - data_url = 'https://api.spreaker.com/episode/%s' % (episode_id) - - data = self._download_json(data_url, episode_id)['response']['episode'] - if not data['download_enabled']: - raise ExtractorError('Not supported yet') - - return self._spreaker_episode_data_to_info(data) + return self.url_result(data_url) From a894bc1c3e5580f90faa965d83073ec5be2a7159 Mon Sep 17 00:00:00 2001 From: 
Andrew Udvare Date: Thu, 10 Aug 2017 16:42:45 -0400 Subject: [PATCH 3/9] [spreaker] Make IE_NAME values unique --- youtube_dl/extractor/spreaker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index d2fb6c304..60f6af624 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -9,7 +9,7 @@ from ..utils import int_or_none, ExtractorError class SpreakerPlaylistIE(InfoExtractor): - IE_NAME = 'spreaker' + IE_NAME = 'spreaker:playlist' _VALID_URL = r'^https?://(?:www\.)?spreaker\.com/show/[a-z0-9_-]+' _TEST = { 'url': 'https://www.spreaker.com/show/success-with-music', @@ -48,7 +48,7 @@ class SpreakerPlaylistIE(InfoExtractor): class SpreakerAPIEpisodeIE(InfoExtractor): - IE_NAME = 'spreaker' + IE_NAME = 'spreaker:api' _VALID_URL = r'^https?://(?:api\.)?spreaker\.com/(?:download/)?episode/(?P[0-9]+)(?:/[^\.]+\.mp3$)?' _TESTS = [ { From 1c9e16b8b9f9fb73c5a717423891ccba96a3f41d Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Thu, 10 Aug 2017 23:28:19 -0400 Subject: [PATCH 4/9] [spreaker] Set extractor name to spreaker to override IE_NAME --- youtube_dl/extractor/spreaker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index 60f6af624..40ccdd973 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -160,6 +160,7 @@ class SpreakerIE(InfoExtractor): 'url': show_image.get('small_url') }, ], + 'extractor': 'spreaker', } def _real_extract(self, url): From b653c19f8fa1d05b2b7479c0bf60e4c83e431717 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Fri, 11 Aug 2017 00:31:57 -0400 Subject: [PATCH 5/9] [spreaker] Handle when playlist JSON has multiple pages --- youtube_dl/extractor/spreaker.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index 
40ccdd973..e27078b76 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -24,12 +24,30 @@ class SpreakerPlaylistIE(InfoExtractor): html = self._download_webpage(url, None) playlist_url = self._html_search_regex( r'data-playlist_url="(?Phttps\://[^"]+")', html, 'url') - items = self._download_json(playlist_url, None) - items = items['response']['playlist']['items'] + items = self._download_json(playlist_url, + None, + 'Downloading playlist JSON') + playlist = items['response']['playlist'] + next_url = playlist.get('next_url') + items = playlist.get('items', []) if not items: raise ExtractorError('Empty playlist') + page_no = 2 + download_str = 'Downloading playlist JSON page #%d' + while next_url: + items_ = self._download_json(next_url, + None, + download_str % (page_no,)) + playlist_ = items_['response']['playlist'] + new_items = playlist_.get('items', []) + if not new_items: + break + items += new_items + next_url = playlist_.get('next_url') + page_no += 1 + urls = [x['api_url'] for x in items] ret = [] for index, url in enumerate(urls): From 2160768a215849e82a167912cb8f0aa054e87d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Jun 2018 23:39:56 +0700 Subject: [PATCH 6/9] [npo] Fix typo (closes #16872) --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cb8319f0d..c2cb85a73 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE): video_url = stream_info.get('url') if not video_url or video_url in urls: continue - urls.add(item_url) + urls.add(video_url) if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, ext='mp4', From eca1f0d115e6a2712ff0d5f6b25e3ded5e52db71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Jul 2018 02:00:16 +0700 Subject: [PATCH 7/9] 
[extractor/common] Properly escape % in MPD templates (closes #16867) --- youtube_dl/extractor/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f3fec160d..78f053f18 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2106,7 +2106,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First of, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) 
counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) From 973b6ceebbf0c79086cbf3203a8a8c79daf0b1ba Mon Sep 17 00:00:00 2001 From: coreynicholson Date: Sun, 1 Jul 2018 15:19:17 +0100 Subject: [PATCH 8/9] [vlive] Fix live streams extraction --- youtube_dl/extractor/vlive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 64d0224e6..0b5165fd0 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s' % video_id, video_id) + 'https://www.vlive.tv/video/%s' % video_id, video_id) VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' VIDEO_PARAMS_FIELD = 'video params' @@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor): def _live(self, video_id, webpage): init_page = self._download_webpage( - 'http://www.vlive.tv/video/init/view', + 'https://www.vlive.tv/video/init/view', video_id, note='Downloading live webpage', data=urlencode_postdata({'videoSeq': video_id}), headers={ - 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Referer': 'https://www.vlive.tv/video/%s' % video_id, 'Content-Type': 'application/x-www-form-urlencoded' }) From 5ec9047d2cba4e29aab1e5d1c5ee7d5b8a4b1ae7 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Sun, 1 Jul 2018 17:25:38 -0400 Subject: [PATCH 9/9] [spreaker] Fixes requested --- youtube_dl/extractor/extractors.py | 2 - youtube_dl/extractor/spreaker.py | 224 ++++++++++++++--------------- 2 files changed, 112 insertions(+), 114 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 95927dd7b..03c857aac 100644 --- a/youtube_dl/extractor/extractors.py +++ 
b/youtube_dl/extractor/extractors.py @@ -1022,10 +1022,8 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE -from .sportschau import SportschauIE from .spreaker import ( SpreakerIE, - SpreakerAPIEpisodeIE, SpreakerPlaylistIE ) from .springboardplatform import SpringboardPlatformIE diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py index e27078b76..d89fcdc92 100644 --- a/youtube_dl/extractor/spreaker.py +++ b/youtube_dl/extractor/spreaker.py @@ -8,16 +8,84 @@ from ..compat import compat_str from ..utils import int_or_none, ExtractorError +def _spreaker_episode_data_to_info(data): + published_at = data.get('published_at') + upload_date = None + if published_at: + upload_date = published_at[0:10].replace('-', '') + author = data.get('author', {}) + stats = data.get('stats', {}) + view_count = like_count = comment_count = None + show = data.get('show', {}) + show_image = show.get('image', {}) + + if stats: + plays = stats.get('plays') + plays_streaming = stats.get('plays_streaming') + plays_download = stats.get('plays_download') + view_count = None + for x in [plays, plays_streaming, plays_download]: + if x is None: + continue + if view_count is None: + view_count = x + else: + view_count += x + like_count = stats.get('likes') + comment_count = stats.get('messages') + + return { + 'id': compat_str(data['episode_id']), + 'title': data['title'], + 'url': data['download_url'], + 'display_id': data.get('permalink'), + 'webpage_url': data.get('site_url'), + 'uploader': author.get('fullname'), + 'creator': author.get('fullname'), + 'release_date': upload_date, + 'upload_date': upload_date, + 'uploader_id': author.get('user_id'), + 'duration': int_or_none(data.get('length')), + 'view_count': int_or_none(view_count), + 'like_count': int_or_none(like_count), + 'comment_count': int_or_none(comment_count), + 'format': 'MPEG Layer 3', + 'format_id': 
'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnail': show_image.get('big_url'), + 'language': show.get('language'), + 'thumbnails': [ + { + 'id': show_image.get('image_id'), + 'url': show_image.get('big_url'), + 'width': int_or_none(show_image.get('width')), + 'height': int_or_none(show_image.get('height')), + }, + { + 'url': show_image.get('large_url'), + }, + { + 'url': show_image.get('medium_url') + }, + { + 'url': show_image.get('small_url') + }, + ], + 'extractor': 'spreaker', + } + + class SpreakerPlaylistIE(InfoExtractor): IE_NAME = 'spreaker:playlist' - _VALID_URL = r'^https?://(?:www\.)?spreaker\.com/show/[a-z0-9_-]+' + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/[a-z0-9_-]+' _TEST = { - 'url': 'https://www.spreaker.com/show/success-with-music', - 'info_dict': { - 'title': 'Success With Music', - 'id': 2317431, - }, - 'playlist_mincount': 14, + 'url': 'https://www.spreaker.com/show/success-with-music', + 'info_dict': { + 'title': 'Success With Music', + 'id': 2317431, + }, + 'playlist_mincount': 14, } def _real_extract(self, url): @@ -49,144 +117,76 @@ class SpreakerPlaylistIE(InfoExtractor): page_no += 1 urls = [x['api_url'] for x in items] - ret = [] + entries = [] for index, url in enumerate(urls): data = self._download_json(url, None)['response']['episode'] - dict_ = SpreakerIE._spreaker_episode_data_to_info(data) - dict_.update({ - 'playlist_id': compat_str(data['show_id']), - 'playlist_title': data['show']['title'], - 'playlist_index': index, - }) - ret.append(dict_) + dict_ = _spreaker_episode_data_to_info(data) + entries.append(dict_) - return self.playlist_result(ret, + return self.playlist_result(entries, data['show_id'], data['show']['title']) -class SpreakerAPIEpisodeIE(InfoExtractor): - IE_NAME = 'spreaker:api' - _VALID_URL = r'^https?://(?:api\.)?spreaker\.com/(?:download/)?episode/(?P[0-9]+)(?:/[^\.]+\.mp3$)?' 
+class SpreakerIE(InfoExtractor): + IE_NAME = 'spreaker' + _VALID_URL = (r'https?://(?:(?:api|www)\.)?spreaker\.com/' + r'(?:(?:(?:download/)?episode/(?P<id>[0-9]+)' + r'(?:/[^\.]+\.mp3$)?)|user/[a-z0-9_-]+/[a-z0-9_-]+)') _TESTS = [ { 'url': 'https://api.spreaker.com/episode/12534508', 'info_dict': { 'id': '12534508', 'ext': 'mp3', - 'title': 'Marketing Your Music - Part 2', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', 'upload_date': '20170809', 'uploader': 'SWM', 'uploader_id': 9780658, }, }, { - 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'url': ('https://api.spreaker.com/download/episode/12534508/' + 'swm_ep15_how_to_market_your_music_part_2.mp3'), 'info_dict': { 'id': '12534508', 'ext': 'mp3', - 'title': 'Marketing Your Music - Part 2', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', 'upload_date': '20170809', 'uploader': 'SWM', 'uploader_id': 9780658, }, }, + { + 'url': ('https://www.spreaker.com/user/9780658/swm-ep15-how-to-' + 'market-your-music-part-2'), + 'info_dict': { + 'id': '12534508', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': 9780658, + }, + } ] def _real_extract(self, url): episode_id = self._match_id(url) - if not re.match(r'^[0-9]+$', episode_id): - raise ExtractorError('Invalid ID') + if re.match(r'^[0-9]+$', episode_id): + url = 'https://api.spreaker.com/episode/%s' % (episode_id,) + else: + html = self._download_webpage(url, + None, + note='Downloading episode page') + episode_id = self._html_search_regex( + r'data-episode_id="(?P<id>[0-9]+)"', html, 'id') + if not re.match(r'^[0-9]+$', episode_id): + raise ExtractorError('Could not find episode ID') + url = 'https://api.spreaker.com/episode/%s' % (episode_id) - url = 'https://api.spreaker.com/episode/%s' % (episode_id,) data = self._download_json(url, episode_id)['response']['episode'] if not data['download_enabled']: raise
ExtractorError('Not supported yet') - return SpreakerIE._spreaker_episode_data_to_info(data) - - -class SpreakerIE(InfoExtractor): - IE_NAME = 'spreaker' - _VALID_URL = r'^https?://(?:www\.)?spreaker\.com/user/[a-z0-9_-]+/[a-z0-9_-]' - _TEST = { - 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', - 'info_dict': { - 'id': '12534508', - 'ext': 'mp3', - 'title': 'Marketing Your Music - Part 2', - 'upload_date': '20170809', - 'uploader': 'SWM', - 'uploader_id': 9780658, - }, - } - - @staticmethod - def _spreaker_episode_data_to_info(data): - published_at = data.get('published_at') - upload_date = None - if published_at: - upload_date = published_at[0:10].replace('-', '') - author = data.get('author', {}) - stats = data.get('stats', {}) - view_count = like_count = comment_count = None - show = data.get('show', {}) - show_image = show.get('image', {}) - - if stats: - view_count = (stats.get('plays', 0) + - stats.get('plays_streaming', 0) + - stats.get('plays_download', 0)) - like_count = stats.get('likes', 0) - comment_count = stats.get('messages', 0) - - return { - 'id': compat_str(data['episode_id']), - 'title': data['title'], - 'url': data['download_url'], - 'display_id': data.get('permalink'), - 'webpage_url': data.get('site_url'), - 'uploader': author.get('fullname'), - 'creator': author.get('fullname'), - 'release_date': upload_date, - 'upload_date': upload_date, - 'uploader_id': author.get('user_id'), - 'duration': int_or_none(data.get('length')), - 'view_count': int_or_none(view_count), - 'like_count': int_or_none(like_count), - 'comment_count': int_or_none(comment_count), - 'format': 'MPEG Layer 3', - 'format_id': 'mp3', - 'container': 'mp3', - 'ext': 'mp3', - 'thumbnail': show_image.get('big_url'), - 'language': show.get('language'), - 'thumbnails': [ - { - 'id': show_image.get('image_id'), - 'url': show_image.get('big_url'), - 'width': int_or_none(show_image.get('width')), - 'height': 
int_or_none(show_image.get('height')), - }, - { - 'url': show_image.get('large_url'), - }, - { - 'url': show_image.get('medium_url') - }, - { - 'url': show_image.get('small_url') - }, - ], - 'extractor': 'spreaker', - } - - def _real_extract(self, url): - html = self._download_webpage(url, None) - episode_id = self._html_search_regex( - r'data-episode_id="(?P[0-9]+)"', html, 'id') - if not re.match(r'^[0-9]+$', episode_id): - raise ExtractorError('Could not find episode ID') - data_url = 'https://api.spreaker.com/episode/%s' % (episode_id) - - return self.url_result(data_url) + return _spreaker_episode_data_to_info(data)