From 5de538787df3f13476489b4a7dfae442c046fab7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Apr 2019 20:44:31 +0100 Subject: [PATCH 1/9] [udemy] add another course id extraction pattern(closes #20491) --- youtube_dl/extractor/udemy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 66ea6fb15..2a4faecef 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -76,7 +76,10 @@ class UdemyIE(InfoExtractor): webpage, 'course', default='{}')), video_id, fatal=False) or {} course_id = course.get('id') or self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + [ + r'data-course-id=["\'](\d+)', + r'"courseId"\s*:\s*(\d+)' + ], webpage, 'course id') return course_id, course.get('title') def _enroll_course(self, base_url, webpage, course_id): From 061d1cd9486d1b31cb37e000e8181f7684024798 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Apr 2019 13:17:22 +0100 Subject: [PATCH 2/9] [nhk] add support for audio URLs --- youtube_dl/extractor/nhk.py | 96 +++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index d4acbcc3e..727302560 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,54 +1,80 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import ExtractorError class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/(?:vod|ondemand)/(?P<id>[^/]+/[^/?#&]+)' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[a-z]+-\d{8}-\d+)' + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # Videos available only for a limited period of time.
Visit - # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. - 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', - 'info_dict': { - 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', - 'ext': 'flv', - 'title': 'TOKYO FASHION EXPRESS - The Kimono as Global Fashion', - 'description': 'md5:db338ee6ce8204f415b754782f819824', - 'series': 'TOKYO FASHION EXPRESS', - 'episode': 'The Kimono as Global Fashion', - }, - 'skip': 'Videos available only for a limited period of time', - }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, }] - _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sodesdlist/v7/episode/%s/%s/all%s.json' def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_json(self._API_URL, video_id) - - try: - episode = next( - e for e in data['data']['episodes'] - if e.get('url') and video_id in e['url']) - except StopIteration: - raise ExtractorError('Unable to find episode') - - embed_code = episode['vod_id'] + lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id.isdigit(): + episode_id = episode_id[:4] + '-' + episode_id[4:] + is_video = m_type == 'video' + episode = self._download_json( + self._API_URL_TEMPLATE % ('v' if is_video else 'r', episode_id, lang, '/all' if is_video else ''), + episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] title = episode.get('sub_title_clean') or episode['sub_title'] - description = episode.get('description_clean') or episode.get('description') - series = episode.get('title_clean') or episode.get('title') - return { - '_type': 
'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % embed_code, + def get_clean_field(key): + return episode.get(key + '_clean') or episode.get(key) + + series = get_clean_field('title') + + thumbnails = [] + for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: + img_path = episode.get('image' + s) + if not img_path: + continue + thumbnails.append({ + 'id': '%dp' % h, + 'height': h, + 'width': w, + 'url': 'https://www3.nhk.or.jp' + img_path, + }) + + info = { + 'id': episode_id + '-' + lang, 'title': '%s - %s' % (series, title) if series and title else title, - 'description': description, + 'description': get_clean_field('description'), + 'thumbnails': thumbnails, 'series': series, 'episode': title, } + if is_video: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:' + episode['vod_id'], + }) + else: + audio = episode['audio'] + audio_path = audio['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', m3u8_id='hls', fatal=False) + info['formats'].append({ + 'ext': 'flv', + 'format_id': 'flv', + 'url': 'rtmp://flv.nhk.or.jp/ondemand/mp4:flv' + audio_path, + 'vcodec': 'none', + }) + for f in info['formats']: + f['language'] = lang + return info From 47cfa0051641d65894da02d64484055b04f767e0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Apr 2019 13:25:04 +0100 Subject: [PATCH 3/9] [nhk] extract rtmpt format --- youtube_dl/extractor/nhk.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 727302560..241412f98 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -69,12 +69,13 @@ class NhkVodIE(InfoExtractor): info['formats'] = self._extract_m3u8_formats( 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, episode_id, 'm4a', m3u8_id='hls', fatal=False) - info['formats'].append({ - 'ext': 'flv', - 
'format_id': 'flv', - 'url': 'rtmp://flv.nhk.or.jp/ondemand/mp4:flv' + audio_path, - 'vcodec': 'none', - }) + for proto in ('rtmpt', 'rtmp'): + info['formats'].append({ + 'ext': 'flv', + 'format_id': proto, + 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), + 'vcodec': 'none', + }) for f in info['formats']: f['language'] = lang return info From c9b19d7a55549baa8b931390d94bdefb12a76d1d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Apr 2019 14:51:26 +0100 Subject: [PATCH 4/9] [ntvcojp] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/ntvcojp.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/ntvcojp.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 86ecc0b66..bac90f277 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,6 +808,7 @@ from .nrk import ( NRKTVSeasonIE, NRKTVSeriesIE, ) +from .ntvcojp import NTVCoJpCUIE from .ntvde import NTVDeIE from .ntvru import NTVRuIE from .nytimes import ( diff --git a/youtube_dl/extractor/ntvcojp.py b/youtube_dl/extractor/ntvcojp.py new file mode 100644 index 000000000..0c8221b22 --- /dev/null +++ b/youtube_dl/extractor/ntvcojp.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + smuggle_url, +) + + +class NTVCoJpCUIE(InfoExtractor): + IE_NAME = 'cu.ntv.co.jp' + IE_DESC = 'Nippon Television Network' + _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program)(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://cu.ntv.co.jp/televiva-chill-gohan_181031/', + 'info_dict': { + 'id': '5978891207001', + 'ext': 'mp4', + 'title': '桜エビと炒り卵がポイント!
「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', + 'upload_date': '20181213', + 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9', + 'uploader_id': '3855502814001', + 'timestamp': 1544669941, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_config = self._parse_json(self._search_regex( + r'(?s)PLAYER_CONFIG\s*=\s*({.+?})', + webpage, 'player config'), display_id, js_to_json) + video_id = player_config['videoId'] + account_id = player_config.get('account') or '3855502814001' + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + 'title': self._search_regex(r'<h1[^>]+class="title"[^>]*>([^<]+)', webpage, 'title').strip(), + 'description': self._html_search_meta(['description', 'og:description'], webpage), + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } From c25720ef6ab7e100d107df64efb3a1e1776fd66a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Apr 2019 17:20:28 +0100 Subject: [PATCH 5/9] [vimeo] add support live streams and improve info extraction(closes #19144) --- youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/vimeo.py | 51 +++++++++++++++++++--------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0889288f0..59ad455c1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2019,6 +2019,8 @@ class InfoExtractor(object): if res is False: return [] mpd_doc, urlh = res + if mpd_doc is None: + return [] mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9aec8a2ab..d404498aa 100644
--- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -109,23 +109,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _parse_config(self, config, video_id): video_data = config['video'] - # Extract title video_title = video_data['title'] - - # Extract uploader, uploader_url and uploader_id - video_uploader = video_data.get('owner', {}).get('name') - video_uploader_url = video_data.get('owner', {}).get('url') - video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None - - # Extract video thumbnail - video_thumbnail = video_data.get('thumbnail') - if video_thumbnail is None: - video_thumbs = video_data.get('thumbs') - if video_thumbs and isinstance(video_thumbs, dict): - _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] - - # Extract video duration - video_duration = int_or_none(video_data.get('duration')) + is_live = try_get(video_data, lambda x: x['live_event']['status']) == 'started' formats = [] config_files = video_data.get('files') or config['request'].get('files', {}) @@ -151,7 +136,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): if files_type == 'hls': formats.extend(self._extract_m3u8_formats( manifest_url, video_id, 'mp4', - 'm3u8_native', m3u8_id=format_id, + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, note='Downloading %s m3u8 information' % cdn_name, fatal=False)) elif files_type == 'dash': @@ -164,6 +149,10 @@ class VimeoBaseInfoExtractor(InfoExtractor): else: mpd_manifest_urls = [(format_id, manifest_url)] for f_id, m_url in mpd_manifest_urls: + if 'json=1' in m_url: + real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') + if real_m_url: + m_url = real_m_url mpd_formats = self._extract_mpd_formats( m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, @@ -184,15 +173,33 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'url': 
'https://vimeo.com' + tt['url'], }] + thumbnails = [] + if not is_live: + for key, thumb in video_data.get('thumbs', {}).items(): + thumbnails.append({ + 'id': key, + 'width': int_or_none(key), + 'url': thumb, + }) + thumbnail = video_data.get('thumbnail') + if thumbnail: + thumbnails.append({ + 'url': thumbnail, + }) + + owner = video_data.get('owner') or {} + video_uploader_url = owner.get('url') + return { - 'title': video_title, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'uploader': owner.get('name'), + 'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None, 'uploader_url': video_uploader_url, - 'thumbnail': video_thumbnail, - 'duration': video_duration, + 'thumbnails': thumbnails, + 'duration': int_or_none(video_data.get('duration')), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } def _extract_original_format(self, url, video_id): From 85b6335d55c7b0ed7f6815f7b8b9a365b0a28c37 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Apr 2019 21:05:58 +0100 Subject: [PATCH 6/9] [vimeo] extract live archive source format(#19144) --- youtube_dl/extractor/vimeo.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d404498aa..a41178bab 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -110,7 +110,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _parse_config(self, config, video_id): video_data = config['video'] video_title = video_data['title'] - is_live = try_get(video_data, lambda x: x['live_event']['status']) == 'started' + live_event = video_data.get('live_event') or {} + is_live = live_event.get('status') == 'started' formats = [] config_files = video_data.get('files') or config['request'].get('files', {}) @@ -127,6 +128,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'tbr': 
int_or_none(f.get('bitrate')), }) + # TODO: fix handling of 308 status code returned for live archive manifest requests for files_type in ('hls', 'dash'): for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): manifest_url = cdn_data.get('url') @@ -164,6 +166,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): f['preference'] = -40 formats.extend(mpd_formats) + live_archive = live_event.get('archive') or {} + live_archive_source_url = live_archive.get('source_url') + if live_archive_source_url and live_archive.get('status') == 'done': + formats.append({ + 'format_id': 'live-archive-source', + 'url': live_archive_source_url, + 'preference': 1, + }) + subtitles = {} text_tracks = config['request'].get('text_tracks') if text_tracks: From fdc2183650a1aed22266bb59d83a1198525d4111 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Apr 2019 10:04:00 +0100 Subject: [PATCH 7/9] [nrl] Add new extractor(closes #15991) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrl.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/nrl.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bac90f277..0e3ccb82d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,6 +808,7 @@ from .nrk import ( NRKTVSeasonIE, NRKTVSeriesIE, ) +from .nrl import NRLTVIE from .ntvcojp import NTVCoJpCUIE from .ntvde import NTVDeIE from .ntvru import NTVRuIE diff --git a/youtube_dl/extractor/nrl.py b/youtube_dl/extractor/nrl.py new file mode 100644 index 000000000..798b91e04 --- /dev/null +++ b/youtube_dl/extractor/nrl.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class NRLTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nrl\.com/tv(/[^/]+)*/(?P<id>[^/?&#]+)' + _TEST = { + 'url':
'https://www.nrl.com/tv/news/match-highlights-titans-v-knights-862805/', + 'info_dict': { + 'id': 'YyNnFuaDE6kPJqlDhG4CGQ_w89mKTau4', + 'ext': 'mp4', + 'title': 'Match Highlights: Titans v Knights', + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + q_data = self._parse_json(self._search_regex( + r"(?s)q-data='({.+?})'", webpage, 'player data'), display_id) + ooyala_id = q_data['videoId'] + return self.url_result( + 'ooyala:' + ooyala_id, 'Ooyala', ooyala_id, q_data.get('title')) From 3fd86cfe13c2ca83c81cc43ed106152a07dcf012 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Apr 2019 10:04:56 +0100 Subject: [PATCH 8/9] [ooyala] add support for geo verification proxy --- youtube_dl/extractor/ooyala.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index ad8bf03f8..e42d67df9 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -36,7 +36,7 @@ class OoyalaBaseIE(InfoExtractor): 'domain': domain, 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', 'embedToken': embed_token, - }), video_id) + }), video_id, headers=self.geo_verification_headers()) cur_auth_data = auth_data['authorization_data'][embed_code] From e09965d550d8d76ea0c616cbb58800ee2249f15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 23 Apr 2019 00:39:16 +0700 Subject: [PATCH 9/9] [soundcloud] Add support for new rendition and improve extraction (closes #20699) --- youtube_dl/extractor/soundcloud.py | 204 +++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 15da3496e..277c3c7b4 100644 --- a/youtube_dl/extractor/soundcloud.py +++ 
b/youtube_dl/extractor/soundcloud.py @@ -15,7 +15,12 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, int_or_none, + KNOWN_EXTENSIONS, + merge_dicts, + mimetype2ext, + str_or_none, try_get, unified_timestamp, update_url_query, @@ -57,7 +62,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'E.T. ExTerrestrial Music', 'timestamp': 1349920598, 'upload_date': '20121011', - 'duration': 143, + 'duration': 143.216, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -100,7 +105,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'jaimeMF', 'timestamp': 1386604920, 'upload_date': '20131209', - 'duration': 9, + 'duration': 9.927, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -120,7 +125,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'jaimeMF', 'timestamp': 1386604920, 'upload_date': '20131209', - 'duration': 9, + 'duration': 9.927, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -140,7 +145,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'oddsamples', 'timestamp': 1389232924, 'upload_date': '20140109', - 'duration': 17, + 'duration': 17.346, 'license': 'cc-by-sa', 'view_count': int, 'like_count': int, @@ -160,7 +165,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'Ori Uplift Music', 'timestamp': 1504206263, 'upload_date': '20170831', - 'duration': 7449, + 'duration': 7449.096, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -180,7 +185,7 @@ class SoundcloudIE(InfoExtractor): 'uploader': 'garyvee', 'timestamp': 1488152409, 'upload_date': '20170226', - 'duration': 207, + 'duration': 207.012, 'thumbnail': r're:https?://.*\.jpg', 'license': 'all-rights-reserved', 'view_count': int, @@ -192,9 +197,31 @@ class SoundcloudIE(InfoExtractor): 'skip_download': True, }, }, + # not available via api.soundcloud.com/i1/tracks/id/streams + { + 'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer', + 'md5':
'e22aecd2bc88e0e4e432d7dcc0a1abf7', + 'info_dict': { + 'id': '583011102', + 'ext': 'mp3', + 'title': 'Mezzo Valzer', + 'description': 'md5:4138d582f81866a530317bae316e8b61', + 'uploader': 'Giovanni Sarani', + 'timestamp': 1551394171, + 'upload_date': '20190228', + 'duration': 180.157, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + } ] - _CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK' + _CLIENT_ID = 'FweeGBOOEOYJWLJN3oEyToGLKhmSz0I7' @staticmethod def _extract_urls(webpage): @@ -202,10 +229,6 @@ class SoundcloudIE(InfoExtractor): r']+src=(["\'])(?P(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', webpage)] - def report_resolve(self, video_id): - """Report information extraction.""" - self.to_screen('%s: Resolving id' % video_id) - @classmethod def _resolv_url(cls, url): return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID @@ -224,6 +247,10 @@ class SoundcloudIE(InfoExtractor): def extract_count(key): return int_or_none(info.get('%s_count' % key)) + like_count = extract_count('favoritings') + if like_count is None: + like_count = extract_count('likes') + result = { 'id': track_id, 'uploader': username, @@ -231,15 +258,17 @@ class SoundcloudIE(InfoExtractor): 'title': title, 'description': info.get('description'), 'thumbnail': thumbnail, - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(info.get('duration'), 1000), 'webpage_url': info.get('permalink_url'), 'license': info.get('license'), 'view_count': extract_count('playback'), - 'like_count': extract_count('favoritings'), + 'like_count': like_count, 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), 'genre': info.get('genre'), } + + format_urls = set() formats = [] query = {'client_id': self._CLIENT_ID} if secret_token is not 
None: @@ -248,6 +277,7 @@ class SoundcloudIE(InfoExtractor): # We can build a direct link to the song format_url = update_url_query( 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) + format_urls.add(format_url) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -256,44 +286,91 @@ class SoundcloudIE(InfoExtractor): 'preference': 10, }) - # We have to retrieve the url + # Old API, does not work for some tracks (e.g. + # https://soundcloud.com/giovannisarani/mezzo-valzer) format_dict = self._download_json( 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query=query) + track_id, 'Downloading track url', query=query, fatal=False) - for key, stream_url in format_dict.items(): - ext, abr = 'mp3', None - mobj = re.search(r'_([^_]+)_(\d+)_url', key) - if mobj: - ext, abr = mobj.groups() - abr = int(abr) - if key.startswith('http'): - stream_formats = [{ - 'format_id': key, - 'ext': ext, - 'url': stream_url, - }] - elif key.startswith('rtmp'): - # The url doesn't have an rtmp app, we have to extract the playpath - url, path = stream_url.split('mp3:', 1) - stream_formats = [{ - 'format_id': key, - 'url': url, - 'play_path': 'mp3:' + path, - 'ext': 'flv', - }] - elif key.startswith('hls'): - stream_formats = self._extract_m3u8_formats( - stream_url, track_id, ext, entry_protocol='m3u8_native', - m3u8_id=key, fatal=False) - else: + if format_dict: + for key, stream_url in format_dict.items(): + if stream_url in format_urls: + continue + format_urls.add(stream_url) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) + if key.startswith('http'): + stream_formats = [{ + 'format_id': key, + 'ext': ext, + 'url': stream_url, + }] + elif key.startswith('rtmp'): + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = stream_url.split('mp3:', 1) + stream_formats = [{ + 
'format_id': key, + 'url': url, + 'play_path': 'mp3:' + path, + 'ext': 'flv', + }] + elif key.startswith('hls'): + stream_formats = self._extract_m3u8_formats( + stream_url, track_id, ext, entry_protocol='m3u8_native', + m3u8_id=key, fatal=False) + else: + continue + + if abr: + for f in stream_formats: + f['abr'] = abr + + formats.extend(stream_formats) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): continue - - if abr: - for f in stream_formats: - f['abr'] = abr - - formats.extend(stream_formats) + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = self._download_json( + update_url_query(format_url, query), track_id, fatal=False) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if not stream_url: + continue + if stream_url in format_urls: + continue + format_urls.add(stream_url) + protocol = try_get(t, lambda x: x['format']['protocol'], compat_str) + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + mimetype = try_get( + t, lambda x: x['format']['mime_type'], compat_str) + ext = mimetype2ext(mimetype) or 'mp3' + format_id_list = [] + if protocol: + format_id_list.append(protocol) + format_id_list.append(ext) + format_id = '_'.join(format_id_list) + formats.append({ + 'url': stream_url, + 'format_id': format_id, + 'ext': ext, + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + }) if not formats: # We fallback to the stream_url in the original info, this @@ -303,11 +380,11 @@ class SoundcloudIE(InfoExtractor): 'url': update_url_query(info['stream_url'], query), 'ext': 'mp3', }) + self._check_formats(formats, track_id) for f in formats: f['vcodec'] = 'none' - self._check_formats(formats, track_id) self._sort_formats(formats) 
result['formats'] = formats @@ -319,6 +396,7 @@ class SoundcloudIE(InfoExtractor): raise ExtractorError('Invalid URL: %s' % url) track_id = mobj.group('track_id') + new_info = {} if track_id is not None: info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID @@ -344,13 +422,31 @@ class SoundcloudIE(InfoExtractor): if token: resolve_title += '/%s' % token - self.report_resolve(full_title) + webpage = self._download_webpage(url, full_title, fatal=False) + if webpage: + entries = self._parse_json( + self._search_regex( + r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage, + 'data', default='[]'), full_title, fatal=False) + if entries: + for e in entries: + if not isinstance(e, dict): + continue + if e.get('id') != 67: + continue + data = try_get(e, lambda x: x['data'][0], dict) + if data: + new_info = data + break + info_json_url = self._resolv_url( + 'https://soundcloud.com/%s' % resolve_title) - url = 'https://soundcloud.com/%s' % resolve_title - info_json_url = self._resolv_url(url) - info = self._download_json(info_json_url, full_title, 'Downloading info JSON') + # Contains some additional info missing from new_info + info = self._download_json( + info_json_url, full_title, 'Downloading info JSON') - return self._extract_info_dict(info, full_title, secret_token=token) + return self._extract_info_dict( + merge_dicts(info, new_info), full_title, secret_token=token) class SoundcloudPlaylistBaseIE(SoundcloudIE): @@ -396,8 +492,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): full_title += '/' + token url += '/' + token - self.report_resolve(full_title) - resolv_url = self._resolv_url(url) info = self._download_json(resolv_url, full_title)