From 8ec2b2c41c7f3952ad9097085993d1f24f6b6776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 30 May 2016 21:48:35 +0700 Subject: [PATCH 01/11] [options] Add --limit-rate alias for rate limiting option Closes #9644 In order to follow regular --verb-noun pattern and better conformity with wget and curl --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 38efd292d..14051b714 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -395,8 +395,8 @@ def parseOpts(overrideArguments=None): downloader = optparse.OptionGroup(parser, 'Download Options') downloader.add_option( - '-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', + '-r', '--limit-rate', '--rate-limit', + dest='ratelimit', metavar='RATE', help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') downloader.add_option( '-R', '--retries', From 877032314fdf2d9b391325f96e3bc53a60ea067c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Jun 2016 18:37:34 +0800 Subject: [PATCH 02/11] [generic] Improve Kaltura detection Closes #4004 --- youtube_dl/extractor/generic.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 303e112d2..5cb188b20 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -918,6 +918,19 @@ class GenericIE(InfoExtractor): 'uploader_id': 'echojecka', }, }, + # Kaltura embed with single quotes + { + 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', + 'info_dict': { + 'id': '0_izeg5utt', + 'ext': 'mp4', + 'title': '35871', + 'timestamp': 1355743100, + 'upload_date': '20121217', + 'uploader_id': 'batchUser', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1903,7 +1916,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P[^']+)',", webpage) or + mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P['\"])wid(?P=q1)\s*:\s*(?P['\"])_?(?P[^'\"]+)(?P=q2),.*?(?P['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P['\"])(?P[^'\"]+)(?P=q4),", webpage) or re.search(r'(?s)(?P["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P["\'])(?P.+?)(?P=q2)', webpage)) if mobj is not None: return self.url_result(smuggle_url( From 28bab13348f84ac75e4d1362ce5828429bb7993f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Jun 2016 19:18:01 +0800 Subject: [PATCH 03/11] [generic,viewlift] Move a test case to the specialized extractor --- youtube_dl/extractor/generic.py | 12 ------------ youtube_dl/extractor/viewlift.py | 4 ++++ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5cb188b20..e478f86a8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -881,18 +881,6 @@ class GenericIE(InfoExtractor): 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, - # Kaltura embed - { - 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', - 'info_dict': { - 'id': '1_eergr3h1', - 'ext': 'mp4', - 'upload_date': '20150226', - 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', - 'timestamp': int, - 'title': 'John Carlson Postgame 2/25/15', - }, - }, # Kaltura embed (different embed code) { 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index dd4a13a4a..19500eba8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -141,6 +141,10 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.kesari.tv/news/video/1461919076414', 'only_matching': True, + }, { + # Was once Kaltura embed + 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', + 'only_matching': True, }] def _real_extract(self, url): From 0ff3749bfe6d149dd7250ea8df83387d3af40e0f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Jun 2016 19:23:09 +0800 Subject: [PATCH 04/11] [udn] Fix m3u8 and f4m extraction as well as improve --- youtube_dl/extractor/udn.py | 62 ++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py index ee35b7227..57dd73aef 100644 --- a/youtube_dl/extractor/udn.py +++ b/youtube_dl/extractor/udn.py @@ -2,10 +2,13 @@ from __future__ import unicode_literals import json +import re + from .common import InfoExtractor from ..utils import ( + determine_ext, + int_or_none, js_to_json, - ExtractorError, ) from ..compat import compat_urlparse @@ -16,13 +19,16 @@ class UDNEmbedIE(InfoExtractor): _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', - 'md5': 'de06b4c90b042c128395a88f0384817e', 'info_dict': { 'id': '300040', 'ext': 'mp4', 'title': '生物老師男變女 全校挺"做自己"', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://video.udn.com/embed/news/300040', 'only_matching': True, @@ -38,39 +44,53 @@ class UDNEmbedIE(InfoExtractor): page = self._download_webpage(url, video_id) options = json.loads(js_to_json(self._html_search_regex( - r'var options\s*=\s*([^;]+);', page, 'video urls dictionary'))) + r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary'))) video_urls = options['video'] if video_urls.get('youtube'): return self.url_result(video_urls.get('youtube'), 'Youtube') - try: - del video_urls['youtube'] - except KeyError: - pass + formats = [] + for video_type, api_url in video_urls.items(): + if not api_url: + continue - formats = [{ - 'url': self._download_webpage( + video_url = self._download_webpage( compat_urlparse.urljoin(url, api_url), video_id, - 'retrieve url for %s video' % video_type), - 'format_id': video_type, - 'preference': 0 if video_type == 'mp4' else -1, - } for video_type, api_url in video_urls.items() if api_url] + note='retrieve url for %s video' % video_type) - if not formats: - raise ExtractorError('No videos found', expected=True) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + else: + mobj = re.search(r'_(?P\d+)p_(?P\d+).mp4', video_url) + a_format = { + 'url': video_url, + # video_type may be 'mp4', which confuses YoutubeDL + 'format_id': 'http-' + video_type, + } + if mobj: + a_format.update({ + 'height': int_or_none(mobj.group('height')), + 'tbr': int_or_none(mobj.group('tbr')), + }) + formats.append(a_format) self._sort_formats(formats) - thumbnail = None - - if options.get('gallery') and len(options['gallery']): - thumbnail = options['gallery'][0].get('original') + thumbnails = [{ + 'url': img_url, + 'id': img_type, + } for img_type, img_url in options.get('gallery', [{}])[0].items() if img_url] return { 'id': video_id, 'formats': formats, 'title': options['title'], - 'thumbnail': thumbnail + 'thumbnails': thumbnails, } From 811586ebcfb04878ad3347706bfee020d0e3652b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Jun 2016 19:23:44 +0800 Subject: [PATCH 05/11] [generic] Update the UDNEmbed test case --- youtube_dl/extractor/generic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e478f86a8..b4138381d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1033,14 +1033,18 @@ class GenericIE(InfoExtractor): }, # UDN embed { - 'url': 'http://www.udn.com/news/story/7314/822787', + 'url': 'https://video.udn.com/news/300346', 'md5': 'fd2060e988c326991037b9aff9df21a6', 'info_dict': { 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, # Ooyala embed { From dde1ce7c061cae123264eb555f1da98956923301 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Jun 2016 20:04:43 +0800 Subject: [PATCH 06/11] [tf1] Fix a regular expression (closes #9656) This is a Python bug fixed in 2.7.6 [1] [1] https://github.com/rg3/youtube-dl/issues/9656#issuecomment-222968594 --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index aff5121b9..6c848dc6f 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})(?:.*?)?\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8}).*?\1', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From 6a1df4fb5fb76710457b59195e8b530ba269f09f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Jun 2016 21:23:58 +0700 Subject: [PATCH 07/11] [spankwire] Add support for new URL format (Closes #9657) --- youtube_dl/extractor/spankwire.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 692fd78e8..92a7120a3 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -96,20 +96,18 @@ class SpankwireIE(InfoExtractor): formats = [] for height, video_url in zip(heights, video_urls): path = compat_urllib_parse_urlparse(video_url).path - _, quality = path.split('/')[4].split('_')[:2] - f = { - 'url': video_url, - 'height': height, - } - tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) - if tbr: - f.update({ - 'tbr': int(tbr), - 'format_id': '%dp' % height, - }) + m = re.search(r'/(?P\d+)[pP]_(?P\d+)[kK]', path) + if m: + tbr = int(m.group('tbr')) + height = int(m.group('height')) else: - f['format_id'] = quality - formats.append(f) + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height, + 'height': height, + 'tbr': tbr, + }) self._sort_formats(formats) age_limit = self._rta_search(webpage) From 6e6b9f600f2f447604f6108fb6486b73cc25def1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 2 Jun 2016 01:10:23 +0700 Subject: [PATCH 08/11] [arte] Add support for playlists and rework tests (Closes #9632) --- youtube_dl/extractor/arte.py | 173 ++++++++++++++++++----------- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 110 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e37fdae13..f40532929 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor): } -class ArteTVPlus7IE(InfoExtractor): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' - +class ArteTVBaseIE(InfoExtractor): @classmethod def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor): video_id = mobj.group('id') return video_id, lang - def _real_extract(self, url): - video_id, lang = self._extract_url_info(url) - webpage = self._download_webpage(url, video_id) - return self._extract_from_webpage(webpage, video_id, lang) - - def _extract_from_webpage(self, webpage, video_id, lang): - patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') - ids = (video_id, '') - # some pages contain multiple videos (like - # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), - # so we first try to look for json URLs that contain the video id from - # the 'vid' parameter. - patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] - json_url = self._html_search_regex( - patterns, webpage, 'json vp url', default=None) - if not json_url: - def find_iframe_url(webpage, default=NO_DEFAULT): - return self._html_search_regex( - r']+src=(["\'])(?P.+\bjson_url=.+?)\1', - webpage, 'iframe url', group='url', default=default) - - iframe_url = find_iframe_url(webpage, None) - if not iframe_url: - embed_url = self._html_search_regex( - r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) - if embed_url: - player = self._download_json( - embed_url, video_id, 'Downloading player page') - iframe_url = find_iframe_url(player['html']) - # en and es URLs produce react-based pages with different layout (e.g. - # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) - if not iframe_url: - program = self._search_regex( - r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', - webpage, 'program', default=None) - if program: - embed_html = self._parse_json(program, video_id) - if embed_html: - iframe_url = find_iframe_url(embed_html['embed_html']) - if iframe_url: - json_url = compat_parse_qs( - compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] - if json_url: - title = self._search_regex( - r']+title=(["\'])(?P.+?)\1', - webpage, 'title', default=None, group='title') - return self._extract_from_json_url(json_url, video_id, lang, title=title) - # Different kind of embed URL (e.g. - # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - embed_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', - webpage, 'embed url', group='url') - return self.url_result(embed_url) - def _extract_from_json_url(self, json_url, video_id, lang, title=None): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] @@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor): return info_dict +class ArteTVPlus7IE(ArteTVBaseIE): + IE_NAME = 'arte.tv:+7' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + webpage = self._download_webpage(url, video_id) + return self._extract_from_webpage(webpage, video_id, lang) + + def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. + patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] + json_url = self._html_search_regex( + patterns, webpage, 'json vp url', default=None) + if not json_url: + def find_iframe_url(webpage, default=NO_DEFAULT): + return self._html_search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url', default=default) + + iframe_url = find_iframe_url(webpage, None) + if not iframe_url: + embed_url = self._html_search_regex( + r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) + if embed_url: + player = self._download_json( + embed_url, video_id, 'Downloading player page') + iframe_url = find_iframe_url(player['html']) + # en and es URLs produce react-based pages with different layout (e.g. + # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) + if not iframe_url: + program = self._search_regex( + r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', + webpage, 'program', default=None) + if program: + embed_html = self._parse_json(program, video_id) + if embed_html: + iframe_url = find_iframe_url(embed_html['embed_html']) + if iframe_url: + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] + if json_url: + title = self._search_regex( + r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', + webpage, 'title', default=None, group='title') + return self._extract_from_json_url(json_url, video_id, lang, title=title) + # Different kind of embed URL (e.g. + # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) + embed_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', + webpage, 'embed url', group='url') + return self.url_result(embed_url) + + # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' @@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:info' _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', 'info_dict': { 'id': '067528-000-A', @@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): 'title': 'Service civique, un cache misère ?', 'upload_date': '20160403', }, - } + }] class ArteTVFutureIE(ArteTVPlus7IE): @@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' + _TESTS = [] + def _real_extract(self, url): video_id, lang = self._extract_url_info(url) if lang == 'folge': @@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:concert' _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', 'md5': '9ea035b7bd69696b67aa2ccaaa218161', 'info_dict': { @@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE): 'upload_date': '20140128', 'description': 'md5:486eb08f991552ade77439fe6d82c305', }, - } + }] class ArteTVCinemaIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:cinema' _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' - _TEST = { + _TESTS = [{ 'url': 'http://cinema.arte.tv/de/node/38291', 'md5': '6b275511a5107c60bacbeeda368c3aa1', 'info_dict': { @@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE): 'upload_date': '20160122', 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', }, - } + }] class ArteTVMagazineIE(ArteTVPlus7IE): @@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE): ) ''' + _TESTS = [] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') lang = mobj.group('lang') json_url = mobj.group('json_url') return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVPlaylistIE(ArteTVBaseIE): + IE_NAME = 'arte.tv:playlist' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' + + _TESTS = [{ + 'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', + 'info_dict': { + 'id': 'PL-013263', + 'title': 'Areva & Uramin', + }, + 'playlist_mincount': 6, + }, { + 'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id, lang = self._extract_url_info(url) + collection = self._download_json( + 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' + % (lang, playlist_id), playlist_id) + title = collection.get('title') + description = collection.get('shortDescription') or collection.get('teaserText') + entries = [ + self._extract_from_json_url( + video['jsonUrl'], video.get('programId') or playlist_id, lang) + for video in collection['videos'] if video.get('jsonUrl')] + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dd4b2b838..dc21cfed9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -56,6 +56,7 @@ from .arte import ( ArteTVDDCIE, ArteTVMagazineIE, ArteTVEmbedIE, + ArteTVPlaylistIE, ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE From 5e3856a2c5af0a622b74921c0d60acde53a664ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Jun 2016 01:19:57 +0700 Subject: [PATCH 09/11] release 2016.06.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- README.md | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index e3de48eb5..ae98e0626 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.30.2*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.30.2** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.05.30.2 +[debug] youtube-dl version 2016.06.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/README.md b/README.md index 7e18112de..253d51bc8 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ which means you can modify it, redistribute it or use it however you like. (experimental) ## Download Options: - -r, --rate-limit LIMIT Maximum download rate in bytes per second + -r, --limit-rate RATE Maximum download rate in bytes per second (e.g. 50K or 4.2M) -R, --retries RETRIES Number of retries (default is 10), or "infinite". diff --git a/docs/supportedsites.md b/docs/supportedsites.md index bbc647030..dcbc632a1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -55,6 +55,7 @@ - **arte.tv:future** - **arte.tv:info** - **arte.tv:magazine** + - **arte.tv:playlist** - **AtresPlayer** - **ATTTechChannel** - **AudiMedia** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ad6fb26c6..fba427dde 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.05.30.2' +__version__ = '2016.06.02' From f4e4aa9b6b7057af400ad404efcca51669012b73 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 1 Jun 2016 21:18:57 +0100 Subject: [PATCH 10/11] [revision3:embed] Add new extractor --- youtube_dl/extractor/revision3.py | 132 ++++++++++++++---------------- 1 file changed, 63 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index 99979ebe1..833d8a2f0 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ -13,8 +13,64 @@ from ..utils import ( ) +class Revision3EmbedIE(InfoExtractor): + IE_NAME = 'revision3:embed' + _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)' + _TEST = { + 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558', + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', + 'uploader_id': 'dnews', + 'uploader': 'DNews', + } + } + _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('playlist_id') + playlist_type = mobj.group('playlist_type') or 'video_id' + video_data = self._download_json( + 'http://revision3.com/api/getPlaylist.json', playlist_id, query={ + 'api_key': self._API_KEY, + 'codecs': 'h264,vp8,theora', + playlist_type: playlist_id, + })['items'][0] + + formats = [] + for vcodec, media in video_data['media'].items(): + for quality_id, quality in media.items(): + if quality_id == 'hls': + formats.extend(self._extract_m3u8_formats( + quality['url'], playlist_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': quality['url'], + 'format_id': '%s-%s' % (vcodec, quality_id), + 'tbr': int_or_none(quality.get('bitrate')), + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + return { + 'id': playlist_id, + 'title': unescapeHTML(video_data['title']), + 'description': unescapeHTML(video_data.get('summary')), + 'uploader': video_data.get('show', {}).get('name'), + 'uploader_id': video_data.get('show', {}).get('slug'), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + } + + class Revision3IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' + IE_NAME = 'revision' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)' _TESTS = [{ 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016', 'md5': 'd94a72d85d0a829766de4deb8daaf7df', @@ -32,52 +88,14 @@ class Revision3IE(InfoExtractor): } }, { # Show - 'url': 'http://testtube.com/brainstuff', - 'info_dict': { - 'id': '251', - 'title': 'BrainStuff', - 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.', - }, - 'playlist_mincount': 93, - }, { - 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial', - 'info_dict': { - 'id': '58227', - 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals', - 'duration': 275, - 'ext': 'webm', - 'title': '5 Weird Ways Plants Can Eat Animals', - 'description': 'Why have some plants evolved to eat meat?', - 'upload_date': '20150120', - 'timestamp': 1421763300, - 'uploader': 'DNews', - 'uploader_id': 'dnews', - }, - }, { - 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', - 'info_dict': { - 'id': '71618', - 'ext': 'mp4', - 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', - 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', - 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start', - 'uploader': 'Editors\' Picks', - 'uploader_id': 'tt-editors-picks', - 'timestamp': 1453309200, - 'upload_date': '20160120', - }, - 'add_ie': ['Youtube'], + 'url': 'http://revision3.com/variant', + 'only_matching': True, }, { # Tag - 'url': 'http://testtube.com/tech-news', - 'info_dict': { - 'id': '21018', - 'title': 'tech news', - }, - 'playlist_mincount': 9, + 'url': 'http://revision3.com/vr', + 'only_matching': True, }] _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s' - _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62' def _real_extract(self, url): domain, display_id = re.match(self._VALID_URL, url).groups() @@ -119,33 +137,9 @@ class Revision3IE(InfoExtractor): }) return info - video_data = self._download_json( - 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id), - video_id)['items'][0] - - formats = [] - for vcodec, media in video_data['media'].items(): - for quality_id, quality in media.items(): - if quality_id == 'hls': - formats.extend(self._extract_m3u8_formats( - quality['url'], video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': quality['url'], - 'format_id': '%s-%s' % (vcodec, quality_id), - 'tbr': int_or_none(quality.get('bitrate')), - 'vcodec': vcodec, - }) - self._sort_formats(formats) - info.update({ - 'title': unescapeHTML(video_data['title']), - 'description': unescapeHTML(video_data.get('summary')), - 'uploader': video_data.get('show', {}).get('name'), - 'uploader_id': video_data.get('show', {}).get('slug'), - 'duration': int_or_none(video_data.get('duration')), - 'formats': formats, + '_type': 'url_transparent', + 'url': 'revision3:%s' % video_id, }) return info else: From 4a684895c0227bf18896eae36e693d7046aacaf4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 1 Jun 2016 21:20:02 +0100 Subject: [PATCH 11/11] [seeker] Add new extractor(closes #9619) --- youtube_dl/extractor/extractors.py | 6 +++- youtube_dl/extractor/seeker.py | 57 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/seeker.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dc21cfed9..9dd55bd70 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -639,7 +639,10 @@ from .regiotv import RegioTVIE from .restudy import RestudyIE from .reuters import ReutersIE from .reverbnation import ReverbNationIE -from .revision3 import Revision3IE +from .revision3 import ( + Revision3EmbedIE, + Revision3IE, +) from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE @@ -678,6 +681,7 @@ from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .screenjunkies import ScreenJunkiesIE from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py new file mode 100644 index 000000000..3b9c65e7e --- /dev/null +++ b/youtube_dl/extractor/seeker.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class SeekerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html' + _TESTS = [{ + # player.loadRevision3Item + 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html', + 'md5': '30c1dc4030cc715cf05b423d0947ac18', + 'info_dict': { + 'id': '76243', + 'ext': 'webm', + 'title': 'Should Trump Be Required To Release His Tax Returns?', + 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?', + 'uploader': 'Seeker Daily', + 'uploader_id': 'seekerdaily', + } + }, { + 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html', + 'playlist': [ + { + 'md5': '83bcd157cab89ad7318dd7b8c9cf1306', + 'info_dict': { + 'id': '67558', + 'ext': 'mp4', + 'title': 'The Pros & Cons Of Zoos', + 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?', + 'uploader': 'DNews', + 'uploader_id': 'dnews', + }, + } + ], + 'info_dict': { + 'id': '1834116536', + 'title': 'After Gorilla Killing, Changes Ahead for Zoos', + 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.', + }, + }] + + def _real_extract(self, url): + display_id, article_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, display_id) + mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage) + if mobj: + playlist_type, playlist_id = mobj.groups() + return self.url_result( + 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id) + else: + entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall( + r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)] + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage))