From 7a5773090789bec38a3f58dfb09039155919a540 Mon Sep 17 00:00:00 2001 From: rrooij Date: Sun, 9 Jul 2017 09:21:40 +0200 Subject: [PATCH 01/22] [npo:live] Fix live stream id extraction (closes #13568) --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 5f8b6def1..516b1e941 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -341,7 +341,7 @@ class NPOLiveIE(NPOBaseIE): webpage = self._download_webpage(url, display_id) live_id = self._search_regex( - r'data-prid="([^"]+)"', webpage, 'live id') + [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id') return { '_type': 'url_transparent', From 15237fcd51dca192103f08a910660616e3b241b8 Mon Sep 17 00:00:00 2001 From: mlindner Date: Sun, 9 Jul 2017 00:54:52 -0700 Subject: [PATCH 02/22] [veoh] Extend _VALID_URL --- youtube_dl/extractor/veoh.py | 73 ++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 0f5d68738..b20dddc5c 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,47 +12,46 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P(?:v|e|yapi-)[\da-zA-Z]+)' - _TESTS = [ - { - 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '620e68e6a3cff80086df3348426c9ca3', - 'info_dict': { - 'id': '56314296', - 'ext': 'mp4', - 'title': 'Straight Backs Are Stronger', - 'uploader': 'LUMOback', - 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', - }, + _TESTS = [{ + 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', + 'md5': '620e68e6a3cff80086df3348426c9ca3', + 'info_dict': { + 'id': '56314296', + 'ext': 'mp4', + 'title': 'Straight Backs Are Stronger', + 'uploader': 'LUMOback', + 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', }, - { - 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', - 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', - 'info_dict': { - 'id': '27701988', - 'ext': 'mp4', - 'title': 'Chile workers cover up to avoid skin damage', - 'description': 'md5:2bd151625a60a32822873efc246ba20d', - 'uploader': 'afp-news', - 'duration': 123, - }, - 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', + 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', + 'info_dict': { + 'id': '27701988', + 'ext': 'mp4', + 'title': 'Chile workers cover up to avoid skin damage', + 'description': 'md5:2bd151625a60a32822873efc246ba20d', + 'uploader': 'afp-news', + 'duration': 123, }, - { - 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', - 'md5': '4fde7b9e33577bab2f2f8f260e30e979', - 'note': 'Embedded ooyala video', - 'info_dict': { - 'id': '69525809', - 'ext': 'mp4', - 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', - 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', - 'uploader': 'newsy-videos', - }, - 'skip': 'This video has been deleted.', + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', + 'md5': '4fde7b9e33577bab2f2f8f260e30e979', + 'note': 'Embedded ooyala video', + 'info_dict': { + 'id': '69525809', + 'ext': 'mp4', + 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', + 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', + 'uploader': 'newsy-videos', }, - ] + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', + 'only_matching': True, + }] def _extract_formats(self, source): formats = [] From 5af2fd7fa02734c2a23f917fb60f1c14da149d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Jul 2017 15:55:04 +0700 Subject: [PATCH 03/22] [eagleplatform] Add support for another embed pattern (#13557) --- youtube_dl/extractor/eagleplatform.py | 36 ++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 76d39adac..5e1de04a1 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -60,16 +60,40 @@ class EaglePlatformIE(InfoExtractor): webpage) if mobj is not None: return mobj.group('url') - # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + PLAYER_JS_RE = r''' + ]+ + src=(?P["\'])(?:https?:)?//(?P(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) + .+? + ''' + # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/) mobj = re.search( r'''(?xs) - ]+ - src=(?P["\'])(?:https?:)?//(?P.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) - .+? + %s ]+ - class=(?P["\'])eagleplayer(?P=q2)[^>]+ + class=(?P["\'])eagleplayer(?P=qclass)[^>]+ data-id=["\'](?P\d+) - ''', webpage) + ''' % PLAYER_JS_RE, webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + # Generalization of "Javascript code usage", "Combined usage" and + # "Usage without attaching to DOM" embeddings (see + # http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + %s + + ''' % PLAYER_JS_RE, webpage) if mobj is not None: return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() From 665e9452461abaff7127653265c78bd585acea6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Jul 2017 15:57:33 +0700 Subject: [PATCH 04/22] [eagleplatform] Add support for referrer protected videos (closes #13557) --- youtube_dl/extractor/eagleplatform.py | 25 ++++++++++++++++++++++--- youtube_dl/extractor/generic.py | 10 +++++----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 5e1de04a1..34891a362 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + unsmuggle_url, ) @@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor): 'view_count': int, }, 'skip': 'Georestricted', + }, { + # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) + 'url': 'tvrainru.media.eagleplatform.com:582306', + 'only_matching': True, }] @staticmethod @@ -103,9 +108,10 @@ class EaglePlatformIE(InfoExtractor): if status != 200: raise ExtractorError(' '.join(response['errors']), expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): + def _download_json(self, url_or_request, video_id, *args, **kwargs): try: - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + response = super(EaglePlatformIE, self)._download_json( + url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError): response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) @@ -117,11 +123,24 @@ class EaglePlatformIE(InfoExtractor): return self._download_json(url_or_request, video_id, note)['data'][0] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + headers = {} + query = { + 'id': video_id, + } + + referrer = smuggled_data.get('referrer') + if referrer: + headers['Referer'] = referrer + query['referrer'] = referrer + player_data = self._download_json( - 'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + 'http://%s/api/player_data' % host, video_id, + headers=headers, query=query) media = player_data['data']['playlist']['viewports'][0]['medialist'][0] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f9bff433c..7232f39db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1185,7 +1185,7 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # Eagle.Platform embed (generic URL) + # EaglePlatform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1200,7 +1200,7 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - # ClipYou (Eagle.Platform) embed (custom URL) + # ClipYou (EaglePlatform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -2443,12 +2443,12 @@ class GenericIE(InfoExtractor): if kaltura_url: return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) - # Look for Eagle.Platform embeds + # Look for EaglePlatform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) if eagleplatform_url: - return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) + return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - # Look for ClipYou (uses Eagle.Platform) embeds + # Look for ClipYou (uses EaglePlatform) embeds mobj = re.search( r']+src="https?://(?Pmedia\.clipyou\.ru)/index/player\?.*\brecord_id=(?P\d+).*"', webpage) if mobj is not None: From 250b042c7e71a6e8bbff534aa41c2b92dae1acf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Jul 2017 16:02:38 +0700 Subject: [PATCH 05/22] [generic] Add tests for #13557 --- youtube_dl/extractor/generic.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7232f39db..95c38698d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1199,6 +1199,24 @@ class GenericIE(InfoExtractor): 'view_count': int, 'age_limit': 0, }, + 'params': { + 'skip_download': True, + }, + }, + # referrer protected EaglePlatform embed + { + 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', + 'info_dict': { + 'id': '582306', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3382, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, # ClipYou (EaglePlatform) embed (custom URL) { @@ -1212,6 +1230,9 @@ class GenericIE(InfoExtractor): 'duration': 216, 'view_count': int, }, + 'params': { + 'skip_download': True, + }, }, # Pladform embed { From 4328ddf82b812420ffc120b4150251f751bff08c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Jul 2017 16:29:52 +0700 Subject: [PATCH 06/22] [extractor/common] Add support for AMP tags in _parse_html5_media_entries --- youtube_dl/extractor/common.py | 7 +++++-- youtube_dl/extractor/generic.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index afeb4c5da..daa10885f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2132,15 +2132,18 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] + # amp-video and amp-audio are very similar to their HTML5 counterparts + # so we wll include them right here (see + # https://www.ampproject.org/docs/reference/components/amp-video) media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see # https://github.com/rg3/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?Pvideo|audio)(?:\s+[^>]*)?>)(.*?)', webpage)) + r'(?s)(<(?P(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)', webpage)) for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 95c38698d..919f4f987 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1770,6 +1770,16 @@ class GenericIE(InfoExtractor): }, 'add_ie': [MediasetIE.ie_key()], }, + { + # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) + 'url': 'https://tvrain.ru/amp/418921/', + 'md5': 'cc00413936695987e8de148b67d14f1d', + 'info_dict': { + 'id': '418921', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject From d2b9f362fabad8f9490825456d8ed679d7159271 Mon Sep 17 00:00:00 2001 From: Christopher Smith Date: Thu, 29 Jun 2017 13:10:45 -0600 Subject: [PATCH 07/22] [cjsw] Add extractor --- youtube_dl/extractor/cjsw.py | 41 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/cjsw.py diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py new file mode 100644 index 000000000..087cac9bc --- /dev/null +++ b/youtube_dl/extractor/cjsw.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CJSWIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/\S+/(?P[0-9]+)' + IE_NAME = 'cjsw' + _TEST = { + 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '20170620', + 'ext': 'mp3', + 'title': 'Freshly Squeezed', + 'description': 'Sled Island artists featured // Live session with Phi Pho, followed by a live session with Sinzere & The Late Nights! // Stay Fresh Y\'all!!', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + + webpage = self._download_webpage(url, episode_id) + + title = self._search_regex( + r']+data-showname=(["\'])(?P(?!\1).+?)\1[^>]*>', webpage, 'title', group='title') + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', fatal=False) + formats = [{ + 'url': self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<audio_url>(?!\1).+?)\1[^>]*>', webpage, 'audio_url', group='audio_url'), + 'ext': 'mp3', + 'vcodec': 'none', + }] + return { + 'id': episode_id, + 'title': title, + 'description': description, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b83c3aba5..4524fa687 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -185,6 +185,7 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE +from .cjsw import CJSWIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .cliprs import ClipRsIE From c319d1c4833f89df818fe39f4c99cdc5c9a8bf01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 17:00:45 +0700 Subject: [PATCH 08/22] [csjw] Fix issues and improve extraction (closes #13525) --- youtube_dl/extractor/cjsw.py | 57 ++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py index 087cac9bc..aab6ea535 100644 --- a/youtube_dl/extractor/cjsw.py +++ b/youtube_dl/extractor/cjsw.py @@ -1,41 +1,66 @@ -# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + determine_ext, + unescapeHTML, +) class CJSWIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/\S+/(?P<id>[0-9]+)' - IE_NAME = 'cjsw' + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' _TEST = { 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', 'md5': 'cee14d40f1e9433632c56e3d14977120', 'info_dict': { - 'id': '20170620', + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', 'ext': 'mp3', - 'title': 'Freshly Squeezed', - 'description': 'Sled Island artists featured // Live session with Phi Pho, followed by a live session with Sinzere & The Late Nights! // Stay Fresh Y\'all!!', - } + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, } def _real_extract(self, url): - episode_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) webpage = self._download_webpage(url, episode_id) - title = self._search_regex( - r'<button[^>]+data-showname=(["\'])(?P<title>(?!\1).+?)\1[^>]*>', webpage, 'title', group='title') - description = self._html_search_regex( - r'<p>(?P<description>.+?)</p>', webpage, 'description', fatal=False) + title = unescapeHTML(self._search_regex( + (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + formats = [{ - 'url': self._search_regex( - r'<button[^>]+data-audio-src=(["\'])(?P<audio_url>(?!\1).+?)\1[^>]*>', webpage, 'audio_url', group='audio_url'), - 'ext': 'mp3', + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), 'vcodec': 'none', }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', fatal=False) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + return { - 'id': episode_id, + 'id': audio_id, 'title': title, 'description': description, 'formats': formats, + 'series': series, + 'episode_id': episode_id, } From 0d2f0b0357325823782884327a158aeccf4f9b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 17:05:11 +0700 Subject: [PATCH 09/22] [csjw] Make description optional --- youtube_dl/extractor/cjsw.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py index aab6ea535..dd271586f 100644 --- a/youtube_dl/extractor/cjsw.py +++ b/youtube_dl/extractor/cjsw.py @@ -11,7 +11,7 @@ from ..utils import ( class CJSWIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', 'md5': 'cee14d40f1e9433632c56e3d14977120', 'info_dict': { @@ -22,7 +22,11 @@ class CJSWIE(InfoExtractor): 'series': 'Freshly Squeezed', 'episode_id': '20170620', }, - } + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -51,7 +55,8 @@ class CJSWIE(InfoExtractor): }] description = self._html_search_regex( - r'<p>(?P<description>.+?)</p>', webpage, 'description', fatal=False) + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) series = self._search_regex( r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, 'series', default=program, group='name') From a02682fd13ce5ba88d2508c90559eaa7f43b65d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 17:09:44 +0700 Subject: [PATCH 10/22] Keep in sync with ffmpeg's current malformed AAC bitstream wording (closes #13587) --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/postprocessor/ffmpeg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b3a6d4d3b..60ee4b7d8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1890,7 +1890,7 @@ class YoutubeDL(object): info_dict.get('protocol') == 'm3u8' and self.params.get('hls_prefer_native')): if fixup_policy == 'warn': - self.report_warning('%s: malformated aac bitstream.' % ( + self.report_warning('%s: malformed AAC bitstream detected.' % ( info_dict['id'])) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM3u8PP(self) @@ -1899,7 +1899,7 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( - '%s: malformated aac bitstream. %s' + '%s: malformed AAC bitstream detected. %s' % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index f021ea8fd..51256a3fb 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -542,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor): temp_filename = prepend_extension(filename, 'temp') options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) + self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) os.remove(encodeFilename(filename)) From ed84454d358f3cbfdc43dab31328b165f9c72c68 Mon Sep 17 00:00:00 2001 From: Santiago Calcagno <santicalcagno@gmail.com> Date: Tue, 13 Jun 2017 12:32:04 -0300 Subject: [PATCH 11/22] [egghead:course] Fix extraction --- youtube_dl/extractor/egghead.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index db921465e..01fcdb6cf 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -22,18 +20,18 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + api_url = 'https://egghead.io/api/v1/series/' + playlist_id + course = self._download_json(api_url, playlist_id) + title = course.get('title') + description = course.get('description') - title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') - ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') - - found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) - entries = [self.url_result(m) for m in found] + lessons = course.get('lessons') + entries = [{'_type': 'url', 'ie_key': 'Wistia', 'url': 'wistia:' + l.get('wistia_id')} for l in lessons] return { '_type': 'playlist', 'id': playlist_id, 'title': title, - 'description': self._og_search_description(webpage), + 'description': description, 'entries': entries, } From 485cb375766df8f2ef79b7fe2915ead4ef61a01e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 17:28:42 +0700 Subject: [PATCH 12/22] [egghead:course] Improve (closes #13370) --- youtube_dl/extractor/egghead.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index 01fcdb6cf..c86f52319 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EggheadCourseIE(InfoExtractor): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' - _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' + _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, @@ -20,18 +20,16 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - api_url = 'https://egghead.io/api/v1/series/' + playlist_id - course = self._download_json(api_url, playlist_id) - title = course.get('title') - description = course.get('description') - lessons = course.get('lessons') - entries = [{'_type': 'url', 'ie_key': 'Wistia', 'url': 'wistia:' + l.get('wistia_id')} for l in lessons] + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': title, - 'description': description, - 'entries': entries, - } + entries = [ + self.url_result( + 'wistia:%s' % lesson['wistia_id'], ie='Wistia', + video_id=lesson['wistia_id'], video_title=lesson.get('title')) + for lesson in course['lessons'] if lesson.get('wistia_id')] + + return self.playlist_result( + entries, playlist_id, course.get('title'), + course.get('description')) From 58179eb7d96ebef26a0083e80a2022fab4ca1558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 17:55:40 +0700 Subject: [PATCH 13/22] [abc.net.au:iview] Extract more formats (closes #13492, closes #13489) --- youtube_dl/extractor/abc.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 0247cabf9..60f753b95 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, int_or_none, parse_iso8601, + try_get, ) @@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id) + format_urls = [ + try_get(stream, lambda x: x['hds-unmetered'], compat_str)] + + # May have higher quality video + sd_url = try_get( + stream, lambda x: x['streams']['hds']['sd'], compat_str) + if sd_url: + format_urls.append(sd_url.replace('metered', 'um')) + + formats = [] + for format_url in format_urls: + if format_url: + formats.extend( + self._extract_akamai_formats(format_url, video_id)) self._sort_formats(formats) subtitles = {} From 256a746d21634eccad07a1e6dcafedcdf8b6181b Mon Sep 17 00:00:00 2001 From: luboss <lubos.katrinec@gmail.com> Date: Fri, 2 Jun 2017 22:44:39 +0200 Subject: [PATCH 14/22] [joj] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/joj.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100755 youtube_dl/extractor/joj.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4524fa687..9ee080895 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -470,6 +470,7 @@ from .jamendo import ( ) from .jeuxvideo import JeuxVideoIE from .jove import JoveIE +from .joj import JojIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py new file mode 100755 index 000000000..2ebfec902 --- /dev/null +++ b/youtube_dl/extractor/joj.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +import re + + +class JojIE(InfoExtractor): + _VALID_URL = r'https?://[a-z0-9]+\.joj\.sk/([^/]+/)*(?P<title_query>(?P<release_date>[0-9]{4}(-[0-9]{2}){2}).*)' # noqa + _TESTS = [{ + 'url': 'https://www.joj.sk/nove-byvanie/archiv/2017-05-28-nove-byvanie', # noqa + 'info_dict': { + 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932', + 'ext': 'mp4', + 'title': 'Nové Bývanie', + 'release_date': '20170528' + } + }, { + 'url': 'http://nasi.joj.sk/epizody/2016-09-06-stari-rodicia', + 'info_dict': { + 'id': 'f18b2c5f-9ea8-4941-a164-a814c53306ad', + 'ext': 'mp4', + 'title': 'Starí Rodičia', + 'release_date': '20160906' + } + }] + + media_src_url = 'http://n16.joj.sk/storage/' + xml_source_url = 'https://media.joj.sk/services/Video.php?clip=' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + release_date = mobj.group('release_date').replace('-', '') + webpage = self._download_webpage(url, 'id') + video_id = self._html_search_regex( + r'https?://([a-z0-9]+\.)joj\.sk/embed/(?P<video_id>[a-f0-9\-]+)', + webpage, 'id', group='video_id') + xml_playlist_url = self.xml_source_url + video_id + xml_playlist_et = self._download_xml(xml_playlist_url, 'XML playlist') + formats = [] + for file_el in xml_playlist_et.findall('files/file'): + try: + height = int(file_el.attrib['id'].replace('p', '')) + except ValueError: + height = 0 + formats.append({'height': height, + 'url': self.media_src_url + file_el.attrib['path'].replace( # noqa + 'dat/', '', 1)}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage).title(), + 'formats': formats, + 'release_date': release_date + } From 73cf76a93fe48240bf82b1685b1403f05b793ebf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 19:05:18 +0700 Subject: [PATCH 15/22] [joj] Rewrite and add support for generic embeds (closes #13268) --- youtube_dl/extractor/generic.py | 17 +++++ youtube_dl/extractor/joj.py | 108 ++++++++++++++++++++++---------- 2 files changed, 93 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 919f4f987..f2c577f98 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -91,6 +91,7 @@ from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE from .wistia import WistiaIE from .mediaset import MediasetIE +from .joj import JojIE class GenericIE(InfoExtractor): @@ -1770,6 +1771,16 @@ class GenericIE(InfoExtractor): }, 'add_ie': [MediasetIE.ie_key()], }, + { + # JOJ.sk embeds + 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'info_dict': { + 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'title': 'Slovenskom sa prehnala vlna silných búrok', + }, + 'playlist_mincount': 5, + 'add_ie': [JojIE.ie_key()], + }, { # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) 'url': 'https://tvrain.ru/amp/418921/', @@ -2722,6 +2733,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + # Look for JOJ.sk embeds + joj_urls = JojIE._extract_urls(webpage) + if joj_urls: + return self.playlist_from_matches( + joj_urls, video_id, video_title, ie=JojIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index 2ebfec902..a764023e9 100755 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -1,56 +1,100 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor import re +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + try_get, +) + class JojIE(InfoExtractor): - _VALID_URL = r'https?://[a-z0-9]+\.joj\.sk/([^/]+/)*(?P<title_query>(?P<release_date>[0-9]{4}(-[0-9]{2}){2}).*)' # noqa + _VALID_URL = r'''(?x) + (?: + joj:| + https?://media\.joj\.sk/embed/ + ) + (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + ''' _TESTS = [{ - 'url': 'https://www.joj.sk/nove-byvanie/archiv/2017-05-28-nove-byvanie', # noqa + 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'info_dict': { 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932', 'ext': 'mp4', - 'title': 'Nové Bývanie', - 'release_date': '20170528' + 'title': 'NOVÉ BÝVANIE', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3118, } }, { - 'url': 'http://nasi.joj.sk/epizody/2016-09-06-stari-rodicia', - 'info_dict': { - 'id': 'f18b2c5f-9ea8-4941-a164-a814c53306ad', - 'ext': 'mp4', - 'title': 'Starí Rodičia', - 'release_date': '20160906' - } + 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', + 'only_matching': True, }] - media_src_url = 'http://n16.joj.sk/storage/' - xml_source_url = 'https://media.joj.sk/services/Video.php?clip=' + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + webpage) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - release_date = mobj.group('release_date').replace('-', '') - webpage = self._download_webpage(url, 'id') - video_id = self._html_search_regex( - r'https?://([a-z0-9]+\.)joj\.sk/embed/(?P<video_id>[a-f0-9\-]+)', - webpage, 'id', group='video_id') - xml_playlist_url = self.xml_source_url + video_id - xml_playlist_et = self._download_xml(xml_playlist_url, 'XML playlist') + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://media.joj.sk/embed/%s' % video_id, video_id) + + title = self._search_regex( + (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)'), webpage, 'title', + default=None, group='title') or self._og_search_title(webpage) + + bitrates = self._parse_json( + self._search_regex( + r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + formats = [] - for file_el in xml_playlist_et.findall('files/file'): - try: - height = int(file_el.attrib['id'].replace('p', '')) - except ValueError: - height = 0 - formats.append({'height': height, - 'url': self.media_src_url + file_el.attrib['path'].replace( # noqa - 'dat/', '', 1)}) + for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []: + if isinstance(format_url, compat_str): + height = self._search_regex( + r'(\d+)[pP]\.', format_url, 'height', default=None) + formats.append({ + 'url': format_url, + 'format_id': '%sp' % height if height else None, + 'height': int(height), + }) + if not formats: + playlist = self._download_xml( + 'https://media.joj.sk/services/Video.php?clip=%s' % video_id, + video_id) + for file_el in playlist.findall('./files/file'): + path = file_el.get('path') + if not path: + continue + format_id = file_el.get('id') or file_el.get('label') + formats.append({ + 'url': 'http://n16.joj.sk/storage/%s' % path.replace( + 'dat/', '', 1), + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', format_id or path, 'height', + default=None)), + }) self._sort_formats(formats) + thumbnail = self._og_search_thumbnail(webpage) + + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + return { 'id': video_id, - 'title': self._og_search_title(webpage).title(), + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats, - 'release_date': release_date } From 6e925598d68f5d5216aa3e9abed5c7706a68c891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 19:15:48 +0700 Subject: [PATCH 16/22] [csjw] Add coding cookie --- youtube_dl/extractor/cjsw.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py index dd271586f..505bdbe16 100644 --- a/youtube_dl/extractor/cjsw.py +++ b/youtube_dl/extractor/cjsw.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re From 71a1db89198100a0e9bc5099aeed622264690203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 20:06:24 +0700 Subject: [PATCH 17/22] [dailymail] Add support for embeds --- youtube_dl/extractor/dailymail.py | 17 ++++++++++++++--- youtube_dl/extractor/generic.py | 21 +++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 538565c66..af3978035 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,8 +14,8 @@ from ..utils import ( class DailyMailIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { @@ -22,7 +24,16 @@ class DailyMailIE(InfoExtractor): 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } - } + }, { + 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f2c577f98..5e8890d41 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -57,6 +57,7 @@ from .dailymotion import ( DailymotionIE, DailymotionCloudIE, ) +from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .mtv import MTVServicesEmbeddedIE @@ -760,6 +761,20 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Dailymotion'], }, + # DailyMail embed + { + 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', + 'info_dict': { + 'id': '1495629', + 'ext': 'mp4', + 'title': 'Care worker punches elderly dementia patient in head 11 times', + 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', + }, + 'add_ie': ['DailyMail'], + 'params': { + 'skip_download': True, + }, + }, # YouTube embed { 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', @@ -2190,6 +2205,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) + # Look for DailyMail embeds + dailymail_urls = DailyMailIE._extract_urls(webpage) + if dailymail_urls: + return self.playlist_from_matches( + dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) + # Look for embedded Wistia player wistia_url = WistiaIE._extract_url(webpage) if wistia_url: From 207acd8465b51d9d00d2bdda22f10858eb7f1bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 20:15:15 +0700 Subject: [PATCH 18/22] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 5d07c12cb..edfde8b6f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,27 @@ version <unreleased> +Core ++ [extractor/common] Add support for AMP tags in _parse_html5_media_entries ++ [utils] Support attributes with no values in get_elements_by_attribute + Extractors ++ [dailymail] Add support for embeds ++ [joj] Add support for joj.sk (#13268) +* [abc.net.au:iview] Extract more formats (#13492, #13489) +* [egghead:course] Fix extraction (#6635, #13370) ++ [cjsw] Add support for cjsw.com (#13525) ++ [eagleplatform] Add support for referrer protected videos (#13557) ++ [eagleplatform] Add support for another embed pattern (#13557) +* [veoh] Extend URL regular expression (#13601) +* [npo:live] Fix live stream id extraction (#13568, #13605) +* [googledrive] Fix height extraction (#13603) ++ [dailymotion] Add support for new layout (#13580) - [yam] Remove extractor +* [xhamster] Extract all formats and fix duration extraction (#13593) ++ [xhamster] Add support for new URL schema (#13593) +* [espn] Extend URL regular expression (#13244, #13549) +* [kaltura] Fix typo in subtitles extraction (#13569) +* [vier] Adapt extraction to redesign (#13575) version 2017.07.02 From 65c416dda896f8a0023f01547e6b707dd57ed30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Jul 2017 20:16:38 +0700 Subject: [PATCH 19/22] release 2017.07.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 974603507..c4314855d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.09*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.07.02 +[debug] youtube-dl version 2017.07.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index edfde8b6f..c379cae71 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.07.09 Core + [extractor/common] Add support for AMP tags in _parse_html5_media_entries diff --git a/docs/supportedsites.md b/docs/supportedsites.md index db2e2bac9..b6a147faf 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -154,6 +154,7 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** + - **CJSW** - **Clipfish** - **cliphunter** - **ClipRs** @@ -369,6 +370,7 @@ - **Jamendo** - **JamendoAlbum** - **JeuxVideo** + - **Joj** - **Jove** - **jpopsuki.tv** - **JWPlatform** @@ -996,7 +998,6 @@ - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - - **Yam**: 蕃薯藤yam天空部落 - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0db974f97..14358a74c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.07.02' +__version__ = '2017.07.09' From 7bf539edcc3dc44481d5196fd01637698653ffc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Jul 2017 00:14:41 +0700 Subject: [PATCH 20/22] [eagleplatform] Fix test --- youtube_dl/extractor/eagleplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 34891a362..42789278e 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -53,7 +53,7 @@ class EaglePlatformIE(InfoExtractor): 'skip': 'Georestricted', }, { # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) - 'url': 'tvrainru.media.eagleplatform.com:582306', + 'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306', 'only_matching': True, }] From b71c18b4343d54ce8373e9a11df882aca1ae82a0 Mon Sep 17 00:00:00 2001 From: coreynicholson <coreynicholson@users.noreply.github.com> Date: Sun, 9 Jul 2017 22:24:04 +0100 Subject: [PATCH 21/22] [vlive:playlist] Add extractor --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/vlive.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9ee080895..eb1541729 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1206,7 +1206,8 @@ from .vk import ( ) from .vlive import ( VLiveIE, - VLiveChannelIE + VLiveChannelIE, + VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e58940607..f3825db5c 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor): }, }] + @classmethod + def suitable(cls, url): + return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) + def _real_extract(self, url): video_id = self._match_id(url) @@ -261,3 +265,55 @@ class VLiveChannelIE(InfoExtractor): return self.playlist_result( entries, channel_code, channel_name) + + +class VLivePlaylistIE(InfoExtractor): + IE_NAME = 'vlive:playlist' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.vlive.tv/video/22867/playlist/22912', + 'info_dict': { + 'id': '22912', + 'title': 'Valentine Day Message from TWICE' + }, + 'playlist_mincount': 9 + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + video_id_match = re.match(self._VALID_URL, url) + assert video_id_match + video_id = compat_str(video_id_match.group('video_id')) + + VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_id) + return self.url_result( + VIDEO_URL_TEMPLATE % video_id, + ie=VLiveIE.ie_key(), video_id=video_id) + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + + webpage = self._download_webpage( + 'http://www.vlive.tv/video/%s/playlist/%s' % (video_id, playlist_id), video_id) + + playlist_name = self._html_search_regex( + r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', + webpage, 'playlist name', fatal=False) + + item_ids = self._search_regex( + r'\bvar\s+playlistVideoSeqs\s*=\s*(\[[^]]+\])', + webpage, 'playlist item ids') + + entries = [] + for item_id in self._parse_json(item_ids, playlist_id): + item_id = compat_str(item_id) + entries.append( + self.url_result( + VIDEO_URL_TEMPLATE % item_id, + ie=VLiveIE.ie_key(), video_id=item_id)) + + return self.playlist_result( + entries, playlist_id, playlist_name) From e3cd1fcdd177613acae4198cafbff51fbbb912c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 10 Jul 2017 04:32:24 +0700 Subject: [PATCH 22/22] [vlive:playlist] Relax and simplify --- youtube_dl/extractor/vlive.py | 41 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index f3825db5c..77c120a57 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -280,10 +280,8 @@ class VLivePlaylistIE(InfoExtractor): } def _real_extract(self, url): - playlist_id = self._match_id(url) - video_id_match = re.match(self._VALID_URL, url) - assert video_id_match - video_id = compat_str(video_id_match.group('video_id')) + mobj = re.match(self._VALID_URL, url) + video_id, playlist_id = mobj.group('video_id', 'id') VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' if self._downloader.params.get('noplaylist'): @@ -294,26 +292,27 @@ class VLivePlaylistIE(InfoExtractor): ie=VLiveIE.ie_key(), video_id=video_id) self.to_screen( - 'Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + 'Downloading playlist %s - add --no-playlist to just download video' + % playlist_id) webpage = self._download_webpage( - 'http://www.vlive.tv/video/%s/playlist/%s' % (video_id, playlist_id), video_id) + 'http://www.vlive.tv/video/%s/playlist/%s' + % (video_id, playlist_id), playlist_id) + + item_ids = self._parse_json( + self._search_regex( + r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, + 'playlist video seqs'), + playlist_id) + + entries = [ + self.url_result( + VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), + video_id=compat_str(item_id)) + for item_id in item_ids] playlist_name = self._html_search_regex( r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', - webpage, 'playlist name', fatal=False) + webpage, 'playlist title', fatal=False) - item_ids = self._search_regex( - r'\bvar\s+playlistVideoSeqs\s*=\s*(\[[^]]+\])', - webpage, 'playlist item ids') - - entries = [] - for item_id in self._parse_json(item_ids, playlist_id): - item_id = compat_str(item_id) - entries.append( - self.url_result( - VIDEO_URL_TEMPLATE % item_id, - ie=VLiveIE.ie_key(), video_id=item_id)) - - return self.playlist_result( - entries, playlist_id, playlist_name) + return self.playlist_result(entries, playlist_id, playlist_name)