diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 5d4db54d5..a7d151af3 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -2480,6 +2480,114 @@ class InfoExtractor(object):
             m3u8_id='hls', fatal=False))
         return formats
 
+    def _find_clappr_data(self, webpage, video_id=None, transform_source=js_to_json):
+        """
+        Find the options object passed to the Clappr.Player constructor.
+
+        See http://clappr.github.io/classes/Player.html#method_constructor
+
+        Return the parsed options as a dict, or None when the page has no
+        Clappr.Player call or its argument cannot be parsed into a dict.
+        """
+        # (?s) lets the options span several lines. The page is searched
+        # as-is: stripping newlines first would let a `//` line comment
+        # swallow the rest of the options
+        mobj = re.search(
+            r'(?s)new\s+Clappr\.Player\s*\(\s*(?P<json>{.+?})\s*\)\s*;',
+            webpage)
+        if not mobj:
+            return None
+        try:
+            clappr_data = self._parse_json(
+                mobj.group('json'), video_id=video_id,
+                transform_source=transform_source)
+        except ExtractorError:
+            # Best effort: an unparsable call means "no player data here"
+            return None
+        return clappr_data if isinstance(clappr_data, dict) else None
+
+    def _parse_clappr_data(self, clappr_data, video_id=None, require_title=True,
+                           m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        """
+        Build an info dict from Clappr.Player options.
+
+        See http://clappr.github.io/classes/Player.html#method_constructor
+
+        The result always has 'id', 'formats' and 'subtitles'; 'thumbnail' and
+        'title' are added when the options provide them. require_title is
+        accepted but currently unused. ExtractorError propagates from
+        _extract_url_list_formats when no source yields a format.
+        """
+        # `sources` is a list, `source` a single entry; accept either shape
+        sources = clappr_data.get('sources') or clappr_data.get('source')
+        if not isinstance(sources, (list, tuple)):
+            sources = [sources] if sources else []
+        formats = self._extract_url_list_formats(
+            sources, video_id=video_id, m3u8_id=m3u8_id, mpd_id=mpd_id,
+            rtmp_params=rtmp_params, base_url=base_url)
+        self._sort_formats(formats)
+
+        info_dict = {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': {},
+        }
+
+        thumbnail = clappr_data.get('poster')
+        if thumbnail:
+            info_dict['thumbnail'] = compat_urlparse.urljoin(base_url, thumbnail)
+
+        # Title from `chromecast` plugin https://github.com/deaathh/sdasdas
+        chromecast = clappr_data.get('chromecast')
+        title = chromecast.get('title') if isinstance(chromecast, dict) else None
+        if title:
+            info_dict['title'] = title
+
+        # Subtitles:
+        # https://github.com/clappr/clappr/blob/master/doc/BUILTIN_PLUGINS.md#playback-configuration
+        tracks = clappr_data.get('externalTracks')
+        playback = clappr_data.get('playback')
+        if not tracks and isinstance(playback, dict):
+            tracks = playback.get('externalTracks')
+        for track in tracks or []:
+            if not isinstance(track, dict):
+                continue
+            if track.get('kind', 'subtitles') != 'subtitles':
+                continue
+            src = track.get('src')
+            if not src:
+                continue
+            lang = track.get('lang') or track.get('language') or track.get('label') or 'undefined'
+            info_dict['subtitles'].setdefault(lang, []).append({
+                'url': compat_urlparse.urljoin(base_url, src),
+                'ext': determine_ext(src),
+            })
+
+        # https://github.com/JMVTechnology/Clappr-Subtitle
+        subtitle = clappr_data.get('subtitle')
+        if subtitle:
+            # lang must be bound even for the bare-URL form of the option
+            lang = None
+            if isinstance(subtitle, dict):
+                src = subtitle.get('src')
+                lang = subtitle.get('lang') or subtitle.get('label')
+            else:
+                src = subtitle
+            if src:
+                src = compat_urlparse.urljoin(base_url, src)
+                ext = determine_ext(src)
+                if not lang:
+                    # Fall back to the file name as the subtitles key
+                    lang = src.split('/')[-1]
+                if video_id and video_id in lang:
+                    lang = lang.replace('%s_' % video_id, '').replace(
+                        video_id, '').replace('.%s' % ext, '')
+                info_dict['subtitles'].setdefault(lang or 'undefined', []).append({
+                    'url': src,
+                    'ext': ext,
+                })
+        return info_dict
+
     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
         query = compat_urlparse.urlparse(url).query
         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
@@ -2533,6 +2641,71 @@ class InfoExtractor(object):
                     })
         return formats
 
+    def _extract_url_list_formats(self, sources, video_id=None,
+                                  m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        """
+        Transform ["url1", "url2", {source: <>, mimeType: <>}] to formats.
+
+        m3u8, mpd, smil and f4m sources are expanded with the matching
+        _extract_*_formats helper; anything else becomes a direct format.
+        Entries without a URL are skipped. rtmp_params is accepted but
+        currently unused. Raises ExtractorError when no format is found.
+        """
+        formats = []
+        for format_id, source in enumerate(sources or []):
+            # The media source URL, or {source: <>, mimeType: <>}
+            if isinstance(source, dict):
+                source_url = source.get('source')
+                mime = source.get('mimeType')
+            else:
+                source_url = source
+                mime = None
+            if not source_url:
+                continue
+
+            if base_url:
+                source_url = compat_urlparse.urljoin(base_url, source_url)
+            ext = mimetype2ext(mime) or determine_ext(source_url, 'mp4')
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False, preference=1))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    source_url, video_id, m3u8_id=m3u8_id, fatal=False))
+            else:
+                # Probe the URL for its size. The probe is best effort: a failed
+                # request or an absent/garbled Content-Length only leaves the
+                # size (and the size-based preference) unset
+                urlh = self._request_webpage(
+                    source_url, video_id, fatal=False,
+                    note='Checking format %d information' % format_id)
+                size = None
+                if urlh:
+                    try:
+                        size = int(urlh.headers.get('Content-Length'))
+                    except (TypeError, ValueError):
+                        pass
+                fmt = {
+                    'url': source_url,
+                    'ext': ext,
+                    'format_id': '%d' % format_id,
+                }
+                if size is not None:
+                    fmt['filesize'] = size
+                    # One preference point per 10 MiB, so bigger files rank higher
+                    fmt['preference'] = size // (10 * 1024 * 1024)
+                formats.append(fmt)
+        if not formats:
+            raise ExtractorError('Source not found', expected=True, video_id=video_id)
+        return formats
+
     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index c6f8a785a..f3a7a47e7 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1266,6 +1266,7 @@ from .viewlift import (
     ViewLiftEmbedIE,
 )
 from .viewster import ViewsterIE
+from .vidlox import VidloxIE
 from .viidea import ViideaIE
 from .vimeo import (
     VimeoIE,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index aa04905ed..e462e2828 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -2060,6 +2060,16 @@ class GenericIE(InfoExtractor):
             'skip': 'TODO: fix nested playlists processing in tests',
         },
         # {
+        #     # Clappr.Player({})
+        #     'url': 'http://demo.teleosmedia.com/mosaic/',
+        #     'md5': "TODO",
+        #     'info_dict': {
+        #         'id': 'mosaic',
+        #         'title': 'video',
+        #         'ext': 'mp4'
+        #     },
+        # },
+        # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
         #     'url': 'https://flipagram.com/f/nyvTSJMKId',
@@ -3118,6 +3128,18 @@ class GenericIE(InfoExtractor):
                 jwplayer_data, video_id, require_title=False, base_url=url)
             return merge_dicts(info, info_dict)
 
+        # Clappr.Player()
+        clappr_dict = self._find_clappr_data(webpage, video_id)
+        if clappr_dict:
+            try:
+                info = self._parse_clappr_data(
+                    clappr_dict, video_id=video_id, base_url=url)
+                return merge_dicts(info, info_dict)
+            except ExtractorError:
+                # No usable source in the player options: keep looking
+                # for other embeds instead of aborting the extraction
+                pass
+
         # Video.js embed
         mobj = re.search(
             r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
diff --git a/youtube_dl/extractor/vidlox.py b/youtube_dl/extractor/vidlox.py
new file mode 100644
index 000000000..c2203a08c
--- /dev/null
+++ b/youtube_dl/extractor/vidlox.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .openload import PhantomJSwrapper
+from ..utils import ExtractorError
+
+
+class VidloxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidlox\.(?:me|tv)/(?:embed-)?(?P<id>[0-9a-z]+)(?:\.html)?'
+
+    _TESTS = [{
+        'url': 'https://vidlox.me/5tq733o3wj1d',
+        'md5': 'f780592146ad0458679064de891f3e3f',
+        'info_dict': {
+            'id': '5tq733o3wj1d',
+            'ext': 'mp4',
+            'title': r're:big buck bunny 1080p surround',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': {
+                'Spanish': [{
+                    'ext': 'srt',
+                }],
+            }
+        }
+    }, {
+        'url': 'https://vidlox.me/embed-bs2nk6dgqio1.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        page_url = 'https://vidlox.me/%s' % video_id
+        phantom = PhantomJSwrapper(self, required_version='2.0')
+
+        # Download the plain page first for a couple of cheap checks
+        webpage = self._download_webpage(page_url, video_id)
+        if 'File not found' in webpage:
+            raise ExtractorError('File not found', expected=True, video_id=video_id)
+
+        title = None
+        if 'This video can be watched as embed only.' in webpage:
+            # Extract the title, then switch to the embed page
+            title = self._html_search_regex(
+                r'<title[^>]*?>(?P<title>.+?)\s*</title>', webpage, 'title',
+                flags=re.DOTALL).replace('Watch ', '', 1)
+            # Drop the stale page; presumably phantom.get() fetches page_url
+            # itself when it is given no HTML - TODO confirm
+            webpage = None
+            page_url = 'https://vidlox.me/embed-%s.html' % video_id
+
+        # Execute the page's JS
+        webpage, _ = phantom.get(page_url, webpage, video_id=video_id)
+
+        # Extract the player options
+        clappr_dict = self._find_clappr_data(webpage, video_id)
+        if not clappr_dict:
+            raise ExtractorError(
+                'Player data not found', expected=False, video_id=video_id)
+
+        # and parse them
+        info_dict = self._parse_clappr_data(
+            clappr_dict, video_id=video_id, base_url=page_url)
+
+        info_dict['title'] = title or self._html_search_regex(
+            r'<h1[^>]*?>(?P<title>.+?)\s*</h1>', webpage, 'title',
+            flags=re.DOTALL)
+
+        return info_dict