From 2b4e1ace4ac422acbe63be2f8cc23429de6812b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 05:36:03 +0700 Subject: [PATCH 01/19] [limelight] Tolerate empty pc formats (closes #15150, closes #15151, closes #15207) --- youtube_dl/extractor/limelight.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index ad65b2759..2803d7e8d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -10,6 +10,7 @@ from ..utils import ( float_or_none, int_or_none, smuggle_url, + try_get, unsmuggle_url, ExtractorError, ) @@ -220,6 +221,12 @@ class LimelightBaseIE(InfoExtractor): 'subtitles': subtitles, } + def _extract_info_helper(self, pc, mobile, i, metadata): + return self._extract_info( + try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [], + try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [], + metadata) + class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' @@ -282,10 +289,7 @@ class LimelightMediaIE(LimelightBaseIE): 'getMobilePlaylistByMediaId', 'properties', smuggled_data.get('source_url')) - return self._extract_info( - pc['playlistItems'][0].get('streams', []), - mobile['mediaList'][0].get('mobileUrls', []) if mobile else [], - metadata) + return self._extract_info_helper(pc, mobile, 0, metadata) class LimelightChannelIE(LimelightBaseIE): @@ -326,10 +330,7 @@ class LimelightChannelIE(LimelightBaseIE): 'media', smuggled_data.get('source_url')) entries = [ - self._extract_info( - pc['playlistItems'][i].get('streams', []), - mobile['mediaList'][i].get('mobileUrls', []) if mobile else [], - medias['media_list'][i]) + self._extract_info_helper(pc, mobile, i, medias['media_list'][i]) for i in range(len(medias['media_list']))] return self.playlist_result(entries, channel_id, pc['title']) From e654829b4c4b8ebd4efb4554dd02cc1418c6fc23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 21:24:22 +0700 Subject: [PATCH 02/19] [digg] Add extractor (closes #15214) --- youtube_dl/extractor/digg.py | 41 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/digg.py diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py new file mode 100644 index 000000000..611134ac0 --- /dev/null +++ b/youtube_dl/extractor/digg.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class DiggIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', + 'info_dict': { + 'id': 'LcqvmS0b', + 'ext': 'mp4', + 'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'", + 'description': 'md5:541bb847648b6ee3d6514bc84b82efda', + 'upload_date': '20180109', + 'timestamp': 1515530551, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + jwplatform_id = self._search_regex( + r'video_id\s*:\s*["\']([a-zA-Z0-9]{8})', webpage, 'jwplatform id', + default=None) + + if not jwplatform_id: + return self.url_result(url, 'Generic') + + return { + '_type': 'url_transparent', + 'ie_key': 'JWPlatform', + 'url': 'jwplatform:%s' % jwplatform_id, + 'id': jwplatform_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a3ad4df1f..3bfd1b7ed 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -259,6 +259,7 @@ from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE +from .digg import DiggIE from .dotsub import DotsubIE from .douyutv import ( DouyuShowIE, From 1b79daffd965fcb3776e8304bd393db6573b50ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Jan 2018 22:19:51 +0700 Subject: [PATCH 03/19] [digg] Improve extraction --- youtube_dl/extractor/digg.py | 43 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py index 611134ac0..913c1750f 100644 --- a/youtube_dl/extractor/digg.py +++ b/youtube_dl/extractor/digg.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import js_to_json class DiggIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ + # JWPlatform via provider 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', 'info_dict': { 'id': 'LcqvmS0b', @@ -18,24 +20,37 @@ class DiggIE(InfoExtractor): 'params': { 'skip_download': True, }, - } + }, { + # Youtube via provider + 'url': 'http://digg.com/video/dog-boat-seal-play', + 'only_matching': True, + }, { + # vimeo as regular embed + 'url': 'http://digg.com/video/dream-girl-short-film', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - jwplatform_id = self._search_regex( - r'video_id\s*:\s*["\']([a-zA-Z0-9]{8})', webpage, 'jwplatform id', - default=None) + info = self._parse_json( + self._search_regex( + r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info', + default='{}'), display_id, transform_source=js_to_json, + fatal=False) - if not jwplatform_id: - return self.url_result(url, 'Generic') + video_id = info.get('video_id') - return { - '_type': 'url_transparent', - 'ie_key': 'JWPlatform', - 'url': 'jwplatform:%s' % jwplatform_id, - 'id': jwplatform_id, - 'display_id': display_id, - } + if video_id: + provider = info.get('provider_name') + if provider == 'youtube': + return self.url_result( + video_id, ie='Youtube', video_id=video_id) + elif provider == 'jwplayer': + return self.url_result( + 'jwplatform:%s' % video_id, ie='JWPlatform', + video_id=video_id) + + return self.url_result(url, 'Generic') From a90641fe87d62936b717b9c2fdbe453578b441ed Mon Sep 17 00:00:00 2001 From: scil Date: Thu, 11 Jan 2018 20:35:09 +0800 Subject: [PATCH 04/19] [ximalaya_extractor] Add new extractor ximalaya (#14687) * [ximalaya_extractor] Add new extractor * format change according by flake8 * changes accoring to review by @yan12125 at github pull #14687 * change %d to %s in a temp str * seond changes accoring to review by @yan12125 at github pull #1468 * improve TESTS about contains * changes accoring to third review by @yan12125 at github pull #1468 * forth changes accoring to forth review by @yan12125 at github pull #1468 --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/ximalaya.py | 233 +++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 youtube_dl/extractor/ximalaya.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3bfd1b7ed..37624d37a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1328,6 +1328,10 @@ from .xiami import ( XiamiArtistIE, XiamiCollectionIE ) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py new file mode 100644 index 000000000..a912e54b8 --- /dev/null +++ b/youtube_dl/extractor/ximalaya.py @@ -0,0 +1,233 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import itertools +import re + +from .common import InfoExtractor + + +class XimalayaBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CN'] + + +class XimalayaIE(XimalayaBaseIE): + IE_NAME = 'ximalaya' + IE_DESC = '喜马拉雅FM' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/sound/(?P[0-9]+)' + _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _TESTS = [ + { + 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'http://m.ximalaya.com/61425525/sound/47740352/', + 'info_dict': { + 'id': '47740352', + 'ext': 'm4a', + 'uploader': '小彬彬爱听书', + 'uploader_id': 61425525, + 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', + 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', + 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['renwen', '人文'], + 'duration': 93, + 'view_count': int, + 'like_count': int, + } + }, + { + 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', + 'info_dict': { + 'id': '15705996', + 'ext': 'm4a', + 'uploader': '李延隆老师', + 'uploader_id': 11045267, + 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', + 'title': 'Lesson 1 Excuse me!', + 'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n" + "听录音,然后回答问题,这是谁的手袋?", + 'thumbnails': [ + { + 'name': 'cover_url', + 'url': r're:^https?://.*\.jpg$', + }, + { + 'name': 'cover_url_142', + 'url': r're:^https?://.*\.jpg$', + 'width': 180, + 'height': 180 + } + ], + 'categories': ['train', '外语'], + 'duration': 40, + 'view_count': int, + 'like_count': int, + } + }, + ] + + def _real_extract(self, url): + + is_m = 'm.ximalaya' in url + scheme = 'https' if url.startswith('https') else 'http' + + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id, + note='Download sound page for %s' % audio_id, + errnote='Unable to get sound page') + + audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id) + audio_info = self._download_json(audio_info_file, audio_id, + 'Downloading info json %s' % audio_info_file, + 'Unable to download info file') + + formats = [] + for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')): + if audio_info.get(k): + formats.append({ + 'format_id': bps, + 'url': audio_info[k], + }) + + thumbnails = [] + for k in audio_info.keys(): + # cover pics kyes like: cover_url', 'cover_url_142' + if k.startswith('cover_url'): + thumbnail = {'name': k, 'url': audio_info[k]} + if k == 'cover_url_142': + thumbnail['width'] = 180 + thumbnail['height'] = 180 + thumbnails.append(thumbnail) + + audio_uploader_id = audio_info.get('uid') + + if is_m: + audio_description = self._html_search_regex(r'(?s)]+>(.+?)', + webpage, 'audio_description', fatal=False) + else: + audio_description = self._html_search_regex(r'(?s)]*>(.+?)', + webpage, 'audio_description', fatal=False) + + if not audio_description: + audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id) + audio_description = self._download_webpage(audio_description_file, audio_id, + note='Downloading description file %s' % audio_description_file, + errnote='Unable to download descrip file', + fatal=False) + audio_description = audio_description.strip() if audio_description else None + + return { + 'id': audio_id, + 'uploader': audio_info.get('nickname'), + 'uploader_id': audio_uploader_id, + 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None, + 'title': audio_info['title'], + 'thumbnails': thumbnails, + 'description': audio_description, + 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))), + 'duration': audio_info.get('duration'), + 'view_count': audio_info.get('play_count'), + 'like_count': audio_info.get('favorites_count'), + 'formats': formats, + } + + +class XimalayaAlbumIE(XimalayaBaseIE): + IE_NAME = 'ximalaya:album' + IE_DESC = '喜马拉雅FM 专辑' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P[0-9]+)/album/(?P[0-9]+)' + _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/' + _BASE_URL_TEMPL = '%s://www.ximalaya.com%s' + _LIST_VIDEO_RE = r']+?href="(?P/%s/sound/(?P\d+)/?)"[^>]+?title="(?P[^>]+)">' + _TESTS = [{ + 'url': 'http://www.ximalaya.com/61425525/album/5534601/', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, { + 'url': 'http://m.ximalaya.com/61425525/album/5534601', + 'info_dict': { + 'title': '唐诗三百首(含赏析)', + 'id': '5534601', + }, + 'playlist_count': 312, + }, + ] + + def _real_extract(self, url): + self.scheme = scheme = 'https' if url.startswith('https') else 'http' + + mobj = re.match(self._VALID_URL, url) + uid, playlist_id = mobj.group('uid'), mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, + note='Download album page for %s' % playlist_id, + errnote='Unable to get album info') + + title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', + webpage, 'title', fatal=False) + + return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + + def _entries(self, page, playlist_id, uid): + html = page + for page_num in itertools.count(1): + for entry in self._process_page(html, uid): + yield entry + + next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', + html, 'list_next_url', default=None, group='more') + if not next_url: + break + + next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) + html = self._download_webpage(next_full_url, playlist_id) + + def _process_page(self, html, uid): + find_from = html.index('album_soundlist') + for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): + yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), + XimalayaIE.ie_key(), + mobj.group('id'), + mobj.group('title')) From 37941fe204ef855590af64584b75d7fc95997fc6 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Thu, 11 Jan 2018 20:36:06 +0800 Subject: [PATCH 05/19] [ChangeLog] Update after #14687 [ci skip] --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 51825ccfe..ea1be934c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [ximalaya] Add extractor (#14687) + [weibo] Add extractor (#15079) * [bilibili] fix extraction (#15188) From 64287560e4a7af9401e84318d9d04783c1b289af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jan 2018 23:06:56 +0700 Subject: [PATCH 06/19] [pandoratv] Add support for new URL format (closes #15131) --- youtube_dl/extractor/pandoratv.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index fc7bd3411..0c27a61d7 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_str, @@ -18,7 +20,13 @@ from ..utils import ( class PandoraTVIE(InfoExtractor): IE_NAME = 'pandora.tv' IE_DESC = '판도라TV' - _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\? # old format + ) + ''' _TESTS = [{ 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', 'info_dict': { @@ -53,14 +61,22 @@ class PandoraTVIE(InfoExtractor): # Test metadata only 'skip_download': True, }, + }, { + 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', + 'only_matching': True, }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + video_id = mobj.group('id') + + if not user_id or not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('prgid', [None])[0] + user_id = qs.get('ch_userid', [None])[0] + if any(not f for f in (video_id, user_id,)): + raise ExtractorError('Invalid URL', expected=True) data = self._download_json( 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' From 609850acfb6c03dcdfa4d9cdba77df3b0a259968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jan 2018 23:10:18 +0700 Subject: [PATCH 07/19] [pandoratv] Add support for mobile URLs (closes #12441) --- youtube_dl/extractor/pandoratv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 0c27a61d7..538738c09 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -24,7 +24,8 @@ class PandoraTVIE(InfoExtractor): https?:// (?: (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format - (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\? # old format + (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format + m\.pandora\.tv/?\? # mobile ) ''' _TESTS = [{ @@ -64,6 +65,9 @@ class PandoraTVIE(InfoExtractor): }, { 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', 'only_matching': True, + }, { + 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', + 'only_matching': True, }] def _real_extract(self, url): From e565a6386e61f7741a5386520f1b36efe2cb3310 Mon Sep 17 00:00:00 2001 From: Chih-Hsuan Yen <yan12125@gmail.com> Date: Fri, 12 Jan 2018 15:36:01 +0800 Subject: [PATCH 08/19] Credit @scil for ximalaya extractor (#14687) [ci skip] --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 5a090a3ef..40215a5cf 100644 --- a/AUTHORS +++ b/AUTHORS @@ -232,3 +232,4 @@ Tatsuyuki Ishi Daniel Weber Kay Bouché Yang Hongbo +Lei Wang From 47e2a9bc53c1f4a10dda62e473ec553108f7ee89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 18:47:47 +0700 Subject: [PATCH 09/19] [viafree] Skip rtmp formats (closes #15232) --- youtube_dl/extractor/tvplay.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 46132eda1..84597b55e 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -273,6 +273,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): + if smuggled_data.get('skip_rtmp'): + continue m = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) if not m: @@ -434,6 +436,10 @@ class ViafreeIE(InfoExtractor): return self.url_result( smuggle_url( 'mtg:%s' % video_id, - {'geo_countries': [ - compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]]}), + { + 'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]], + # rtmp host mtgfs.fplive.net for viafree is unresolvable + 'skip_rtmp': True, + }), ie=TVPlayIE.ie_key(), video_id=video_id) From d4aedca3bd82288c802d4d766d5542bfcec4a91a Mon Sep 17 00:00:00 2001 From: "Hendrik v. Raven" <hendrik@consetetur.de> Date: Sat, 6 Jan 2018 15:09:53 +0100 Subject: [PATCH 10/19] [gamestar] Add support for gamepro.de (closes #3384) --- youtube_dl/extractor/gamestar.py | 43 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index e607d6ab8..7ce2f15de 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -9,21 +9,27 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TEST = { - 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', - 'info_dict': { - 'id': '76110', - 'ext': 'mp4', - 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542020, - 'upload_date': '20140728', - 'duration': 17 - } - } + _VALID_URL = r'https?://(?:www\.)?game(?:pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [ + { + 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', + 'info_dict': { + 'id': '76110', + 'ext': 'mp4', + 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542380, + 'upload_date': '20140728', + 'duration': 17, + } + }, + { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -38,11 +44,12 @@ class GameStarIE(InfoExtractor): webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') + info_dict['title'] = remove_end(info_dict['title'], ' - GamePro') - view_count = json_ld.get('interactionCount') + view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( - r'([0-9]+) Kommentare</span>', webpage, 'comment_count', - fatal=False)) + r'<span>Kommentare</span><span class="count">\(([0-9]+)\)</span>', + webpage, 'comment_count', fatal=False)) info_dict.update({ 'id': video_id, From df16e645f60def2b5e1cf88d74164d6ced0d5651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 19:36:26 +0700 Subject: [PATCH 11/19] [gamestar] Fix issues (closes #15179) --- youtube_dl/extractor/gamestar.py | 61 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 7ce2f15de..f00dab2f3 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,33 +11,34 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?game(?:pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' - _TESTS = [ - { - 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', - 'md5': 'ee782f1f8050448c95c5cacd63bc851c', - 'info_dict': { - 'id': '76110', - 'ext': 'mp4', - 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1406542380, - 'upload_date': '20140728', - 'duration': 17, - } - }, - { - 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', - 'only_matching': True, - }, - ] + _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', + 'md5': 'ee782f1f8050448c95c5cacd63bc851c', + 'info_dict': { + 'id': '76110', + 'ext': 'mp4', + 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1406542380, + 'upload_date': '20140728', + 'duration': 17, + } + }, { + 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }, { + 'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') + video_id = mobj.group('id') - url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id + webpage = self._download_webpage(url, video_id) # TODO: there are multiple ld+json objects in the webpage, # while _search_json_ld finds only the first one @@ -43,17 +46,17 @@ class GameStarIE(InfoExtractor): r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) - info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') - info_dict['title'] = remove_end(info_dict['title'], ' - GamePro') + info_dict['title'] = remove_end( + info_dict['title'], ' - Game%s' % site.title()) view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( - r'<span>Kommentare</span><span class="count">\(([0-9]+)\)</span>', - webpage, 'comment_count', fatal=False)) + r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)', + webpage, 'comment count', fatal=False)) info_dict.update({ 'id': video_id, - 'url': url, + 'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id, 'ext': 'mp4', 'view_count': view_count, 'comment_count': comment_count From 2d8bb80c60289868d479e36a90cf1b73e9221893 Mon Sep 17 00:00:00 2001 From: Sebastian Leske <sebastian.leske@sleske.name> Date: Wed, 25 Oct 2017 14:59:57 +0200 Subject: [PATCH 12/19] [wdr:elefant] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wdr.py | 54 ++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 37624d37a..255df75fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1289,6 +1289,7 @@ from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, + WDRElefantIE, WDRMobileIE, ) from .webcaster import ( diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 621de1e1e..4871ae92b 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -16,7 +16,7 @@ from ..utils import ( class WDRBaseIE(InfoExtractor): - def _extract_wdr_video(self, webpage, display_id): + def _extract_jsonp_url(self, webpage, display_id): # for wdr.de the data-extension is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link @@ -35,8 +35,9 @@ class WDRBaseIE(InfoExtractor): media_link_obj = self._parse_json(json_metadata, display_id, transform_source=js_to_json) - jsonp_url = media_link_obj['mediaObj']['url'] + return media_link_obj['mediaObj']['url'] + def _extract_wdr_video(self, jsonp_url, display_id): metadata = self._download_json( jsonp_url, display_id, transform_source=strip_jsonp) @@ -206,7 +207,8 @@ class WDRIE(WDRBaseIE): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - info_dict = self._extract_wdr_video(webpage, display_id) + jsonp_url = self._extract_jsonp_url(webpage, display_id) + info_dict = self._extract_wdr_video(jsonp_url, display_id) if not info_dict: entries = [ @@ -239,6 +241,52 @@ class WDRIE(WDRBaseIE): return info_dict +class WDRElefantIE(WDRBaseIE): + _VALID_URL = r'https?://(?:www\.)wdrmaus.de/elefantenseite/#(?P<display_id>.+)' + IE_NAME = 'wdr:elefant' + + _TESTS = [ + { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' + }, + 'params': { + 'skip_download' : True, + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + # Table of Contents seems to always be at this address, so fetch it directly. + # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. + table_of_contents = self._download_json( + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id) + if display_id not in table_of_contents: + raise ExtractorError( + 'No entry in site\'s table of contents for this URL. ' + 'Is the fragment part of the URL (after the #) correct?', + expected=True) + xml_metadata_path = table_of_contents[display_id]['xmlPath'] + xml_metadata = self._download_xml( + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id) + zmdb_url_element = xml_metadata.find('./movie/zmdb_url') + if zmdb_url_element is None: + raise ExtractorError( + 'The URL looks valid, but no video was found. Note that download only works ' + 'on pages showing a single video, not on video selection pages.', + expected=True) + info_dict = self._extract_wdr_video(zmdb_url_element.text, display_id) + return info_dict + + class WDRMobileIE(InfoExtractor): _VALID_URL = r'''(?x) https?://mobile-ondemand\.wdr\.de/ From 54e8f62e01b54eeccd8313349f86ae541082704f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:28:08 +0700 Subject: [PATCH 13/19] [wdr] Rework extractors (closes #14598) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/sportschau.py | 38 ----- youtube_dl/extractor/wdr.py | 232 +++++++++++++++-------------- 3 files changed, 124 insertions(+), 148 deletions(-) delete mode 100644 youtube_dl/extractor/sportschau.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 255df75fe..c82614bf9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -991,7 +991,6 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE -from .sportschau import SportschauIE from .sprout import SproutIE from .srgssr import ( SRGSSRIE, @@ -1289,6 +1288,7 @@ from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, + WDRPageIE, WDRElefantIE, WDRMobileIE, ) diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py deleted file mode 100644 index 0d7925a08..000000000 --- a/youtube_dl/extractor/sportschau.py +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .wdr import WDRBaseIE -from ..utils import get_element_by_attribute - - -class SportschauIE(WDRBaseIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html' - _TEST = { - 'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html', - 'info_dict': { - 'id': 'mdb-1140188', - 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', - 'ext': 'mp4', - 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', - 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', - 'upload_date': '20160615', - }, - 'skip': 'Geo-restricted to Germany', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_wdr_video(webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 4871ae92b..6bf5aeaed 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -4,50 +4,52 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, js_to_json, strip_jsonp, + try_get, unified_strdate, update_url_query, urlhandle_detect_ext, ) -class WDRBaseIE(InfoExtractor): - def _extract_jsonp_url(self, webpage, display_id): - # for wdr.de the data-extension is in a tag with the class "mediaLink" - # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus, in a tag with the class "videoButton" (previously a link - # to the page in a multiline "videoLink"-tag) - json_metadata = self._html_search_regex( - r'''(?sx)class= - (?: - (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| - (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 - ''', - webpage, 'media link', default=None, group='data') +class WDRIE(InfoExtractor): + _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _TEST = { + 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', + 'info_dict': { + 'id': 'mdb-1140188', + 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', + 'ext': 'mp4', + 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', + 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', + 'upload_date': '20160615', + }, + 'skip': 'Geo-restricted to Germany', + } - if not json_metadata: - return + def _real_extract(self, url): + video_id = self._match_id(url) - media_link_obj = self._parse_json(json_metadata, display_id, - transform_source=js_to_json) - return media_link_obj['mediaObj']['url'] - - def _extract_wdr_video(self, jsonp_url, display_id): metadata = self._download_json( - jsonp_url, display_id, transform_source=strip_jsonp) + url, video_id, transform_source=strip_jsonp) - metadata_tracker_data = metadata['trackerData'] - metadata_media_resource = metadata['mediaResource'] + is_live = metadata.get('mediaType') == 'live' + + tracker_data = metadata['trackerData'] + media_resource = metadata['mediaResource'] formats = [] # check if the metadata contains a direct URL to a file - for kind, media_resource in metadata_media_resource.items(): + for kind, media_resource in media_resource.items(): if kind not in ('dflt', 'alt'): continue @@ -58,13 +60,13 @@ class WDRBaseIE(InfoExtractor): ext = determine_ext(medium_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - medium_url, display_id, 'mp4', 'm3u8_native', + medium_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) elif ext == 'f4m': manifest_url = update_url_query( medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) formats.extend(self._extract_f4m_formats( - manifest_url, display_id, f4m_id='hds', fatal=False)) + manifest_url, video_id, f4m_id='hds', fatal=False)) elif ext == 'smil': formats.extend(self._extract_smil_formats( medium_url, 'stream', fatal=False)) @@ -74,7 +76,7 @@ class WDRBaseIE(InfoExtractor): } if ext == 'unknown_video': urlh = self._request_webpage( - medium_url, display_id, note='Determining extension') + medium_url, video_id, note='Determining extension') ext = urlhandle_detect_ext(urlh) a_format['ext'] = ext formats.append(a_format) @@ -82,30 +84,30 @@ class WDRBaseIE(InfoExtractor): self._sort_formats(formats) subtitles = {} - caption_url = metadata_media_resource.get('captionURL') + caption_url = media_resource.get('captionURL') if caption_url: subtitles['de'] = [{ 'url': caption_url, 'ext': 'ttml', }] - title = metadata_tracker_data['trackerClipTitle'] + title = tracker_data['trackerClipTitle'] return { - 'id': metadata_tracker_data.get('trackerClipId', display_id), - 'display_id': display_id, - 'title': title, - 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'id': tracker_data.get('trackerClipId', video_id), + 'title': self._live_title(title) if is_live else title, + 'alt_title': tracker_data.get('trackerClipSubcategory'), 'formats': formats, 'subtitles': subtitles, - 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), + 'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')), + 'is_live': is_live, } -class WDRIE(WDRBaseIE): +class WDRPageIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' - _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' - _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { @@ -125,6 +127,7 @@ class WDRIE(WDRBaseIE): 'ext': 'ttml', }]}, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', @@ -140,19 +143,17 @@ class WDRIE(WDRBaseIE): 'is_live': False, 'subtitles': {} }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-103364', + 'id': 'mdb-1406149', 'ext': 'mp4', - 'display_id': 'index', - 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': None, - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'upload_date': '20150101', 'is_live': True, - 'subtitles': {} }, 'params': { 'skip_download': True, # m3u8 download @@ -160,19 +161,18 @@ class WDRIE(WDRBaseIE): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 8, + 'playlist_mincount': 7, 'info_dict': { - 'id': 'aktuelle-stunde/aktuelle-stunde-120', + 'id': 'aktuelle-stunde-120', }, }, { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1323501', + 'id': 'mdb-1552552', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', - 'description': 'Die Seite mit der Maus -', }, 'skip': 'The id changes from week to week because of the new episode' }, @@ -184,7 +184,6 @@ class WDRIE(WDRBaseIE): 'ext': 'mp4', 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', - 'description': 'Die Seite mit der Maus -', }, }, { @@ -192,83 +191,100 @@ class WDRIE(WDRBaseIE): # Live stream, MD5 unstable 'info_dict': { 'id': 'mdb-869971', - 'ext': 'flv', - 'title': 'COSMO Livestream', - 'description': 'md5:2309992a6716c347891c045be50992e4', + 'ext': 'mp4', + 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20160101', }, + 'params': { + 'skip_download': True, # m3u8 download + } + }, + { + 'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html', + 'info_dict': { + 'id': 'mdb-1556012', + 'ext': 'mp4', + 'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"', + 'upload_date': '20180111', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', + 'only_matching': True, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - url_type = mobj.group('type') - page_url = mobj.group('page_url') display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - jsonp_url = self._extract_jsonp_url(webpage, display_id) - info_dict = self._extract_wdr_video(jsonp_url, display_id) + entries = [] - if not info_dict: + # Article with several videos + + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus, in a tag with the class "videoButton" (previously a link + # to the page in a multiline "videoLink"-tag) + for mobj in re.finditer( + r'''(?sx)class= + (?: + (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| + (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* + )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3 + ''', webpage): + media_link_obj = self._parse_json( + mobj.group('data'), display_id, transform_source=js_to_json, + fatal=False) + if not media_link_obj: + continue + jsonp_url = try_get( + media_link_obj, lambda x: x['mediaObj']['url'], compat_str) + if jsonp_url: + entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) + + # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) + if not entries: entries = [ - self.url_result(page_url + href[0], 'WDR') - for href in re.findall( - r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX, - webpage) + self.url_result( + compat_urlparse.urljoin(url, mobj.group('href')), + ie=WDRPageIE.ie_key()) + for mobj in re.finditer( + r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=', + webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) ] - if entries: # Playlist page - return self.playlist_result(entries, playlist_id=display_id) - - raise ExtractorError('No downloadable streams found', expected=True) - - is_live = url_type == 'live' - - if is_live: - info_dict.update({ - 'title': self._live_title(info_dict['title']), - 'upload_date': None, - }) - elif 'upload_date' not in info_dict: - info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date')) - - info_dict.update({ - 'description': self._html_search_meta('Description', webpage), - 'is_live': is_live, - }) - - return info_dict + return self.playlist_result(entries, playlist_id=display_id) -class WDRElefantIE(WDRBaseIE): - _VALID_URL = r'https?://(?:www\.)wdrmaus.de/elefantenseite/#(?P<display_id>.+)' - IE_NAME = 'wdr:elefant' - - _TESTS = [ - { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', - 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', - 'ext': 'mp4', - 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download' : True, - }, +class WDRElefantIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' + _TEST = { + 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'info_dict': { + 'title': 'Folge Oster-Spezial 2015', + 'id': 'mdb-1088195', + 'ext': 'mp4', + 'age_limit': None, + 'upload_date': '20150406' }, - ] + 'params': { + 'skip_download': True, + }, + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = self._match_id(url) # Table of Contents seems to always be at this address, so fetch it directly. # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5. table_of_contents = self._download_json( - 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id) + 'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', + display_id) if display_id not in table_of_contents: raise ExtractorError( 'No entry in site\'s table of contents for this URL. ' @@ -276,15 +292,13 @@ class WDRElefantIE(WDRBaseIE): expected=True) xml_metadata_path = table_of_contents[display_id]['xmlPath'] xml_metadata = self._download_xml( - 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id) + 'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, + display_id) zmdb_url_element = xml_metadata.find('./movie/zmdb_url') if zmdb_url_element is None: raise ExtractorError( - 'The URL looks valid, but no video was found. Note that download only works ' - 'on pages showing a single video, not on video selection pages.', - expected=True) - info_dict = self._extract_wdr_video(zmdb_url_element.text, display_id) - return info_dict + '%s is not a video' % display_id, expected=True) + return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key()) class WDRMobileIE(InfoExtractor): From 1915662d4fe09120d3f28db55c7be90b4d12a9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:30:56 +0700 Subject: [PATCH 14/19] [wdr] Bypass geo restriction --- youtube_dl/extractor/wdr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 6bf5aeaed..d6ba254f5 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -22,6 +22,7 @@ from ..utils import ( class WDRIE(InfoExtractor): _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _GEO_COUNTRIES = ['DE'] _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { From 0ce39bc542813462c2d95dc21f1b363c4ae7a1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 13 Jan 2018 23:33:52 +0700 Subject: [PATCH 15/19] [wdr] Fix test --- youtube_dl/extractor/wdr.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index d6ba254f5..cf6f7c7ed 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -26,14 +26,11 @@ class WDRIE(InfoExtractor): _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { - 'id': 'mdb-1140188', - 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100', + 'id': 'mdb-1557833', 'ext': 'mp4', - 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen', - 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.', - 'upload_date': '20160615', + 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', + 'upload_date': '20180112', }, - 'skip': 'Geo-restricted to Germany', } def _real_extract(self, url): From 391dd6f0946e2dd499147c5f3c6bb13642314515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:03:22 +0700 Subject: [PATCH 16/19] [youtube] Fix live streams extraction (closes #15202) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0919bef0e..a01ec1436 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1810,7 +1810,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: + elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) From dd896a6a07939a343776570770b5c8f69e8c0988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:10:04 +0700 Subject: [PATCH 17/19] [ChangeLog] Actualize --- ChangeLog | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea1be934c..7c63d4bac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,22 @@ version <unreleased> Extractors -+ [ximalaya] Add extractor (#14687) +* [youtube] Fix live streams extraction (#15202) +* [wdr] Bypass geo restriction +* [wdr] Rework extractors (#14598) ++ [wdr] Add support for wdrmaus.de/elefantenseite (#14598) ++ [gamestar] Add support for gamepro.de (#3384) +* [viafree] Skip rtmp formats (#15232) ++ [pandoratv] Add support for mobile URLs (#12441) ++ [pandoratv] Add support for new URL format (#15131) ++ [ximalaya] Add support for ximalaya.com (#14687) ++ [digg] Add support for digg.com (#15214) +* [limelight] Tolerate empty pc formats (#15150, #15151, #15207) +* [ndr:embed:base] Make separate formats extraction non fatal (#15203) + [weibo] Add extractor (#15079) -* [bilibili] fix extraction (#15188) ++ [ok] Add support for live streams +* [canalplus] Fix extraction (#15072) +* [bilibili] Fix extraction (#15188) version 2018.01.07 From e11ccd76c6dc8438c01d30445627eab5203cb1dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 14 Jan 2018 00:13:56 +0700 Subject: [PATCH 18/19] release 2018.01.14 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 10 ++++++++-- youtube_dl/version.py | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ad52c8900..6bc7d0366 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.14** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.01.07 +[debug] youtube-dl version 2018.01.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7c63d4bac..bfafaca6a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2018.01.14 Extractors * [youtube] Fix live streams extraction (#15202) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 79b343048..c04a75b88 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -128,7 +128,7 @@ - **CamdemyFolder** - **CamWithHer** - **canalc2.tv** - - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv + - **Canalplus**: mycanal.fr and piwiplus.fr - **Canvas** - **CanvasEen**: canvas.be and een.be - **CarambaTV** @@ -210,6 +210,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **Digg** - **DigitallySpeaking** - **Digiteka** - **Discovery** @@ -773,7 +774,6 @@ - **Sport5** - **SportBoxEmbed** - **SportDeutschland** - - **Sportschau** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** @@ -1002,10 +1002,14 @@ - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** + - **WDRElefant** + - **WDRPage** - **Webcaster** - **WebcasterFeed** - **WebOfStories** - **WebOfStoriesPlaylist** + - **Weibo** + - **WeiboMobile** - **WeiqiTV**: WQTV - **wholecloud**: WholeCloud - **Wimp** @@ -1025,6 +1029,8 @@ - **xiami:artist**: 虾米音乐 - 歌手 - **xiami:collection**: 虾米音乐 - 精选集 - **xiami:song**: 虾米音乐 + - **ximalaya**: 喜马拉雅FM + - **ximalaya:album**: 喜马拉雅FM 专辑 - **XMinus** - **XNXX** - **Xstream** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9030e2415..498149110 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.07' +__version__ = '2018.01.14' From a86922c4702e2c8538337124c5bf02a4b5f9aa4a Mon Sep 17 00:00:00 2001 From: Reto Kromer <retokromer@users.noreply.github.com> Date: Sat, 13 Jan 2018 18:58:38 +0100 Subject: [PATCH 19/19] [README.md] Clarify macOS name --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 47b0640ab..eb05f848f 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on macOS. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. youtube-dl [OPTIONS] URL [URL...] @@ -863,7 +863,7 @@ Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). -Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, Mac OS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare).