New extractor Vidlox
This commit is contained in:
bato3 2018-07-12 17:42:47 +02:00
parent 40a051fa9f
commit 9dc48d44b5
4 changed files with 220 additions and 0 deletions

View File

@ -2480,6 +2480,85 @@ class InfoExtractor(object):
m3u8_id='hls', fatal=False)) m3u8_id='hls', fatal=False))
return formats return formats
def _find_clappr_data(self, webpage, video_id=None, transform_source=js_to_json):
    """
    Find the options object passed to ``new Clappr.Player({...})``.

    http://clappr.github.io/classes/Player.html#method_constructor

    Newlines and tabs are removed from ``webpage`` before matching, so an
    options object written over several lines is still matched.

    webpage          -- HTML/JS text to search
    video_id         -- forwarded to ``_parse_json``
    transform_source -- forwarded to ``_parse_json``

    Returns the parsed options when they form a dict. Returns None when no
    constructor call is found, when ``_parse_json`` raises ExtractorError,
    or when the parsed value is not a dict.
    """
    flattened = webpage.replace('\n', '').replace('\t', '')
    # The dot is escaped so that only a literal "Clappr.Player" matches, and
    # optional whitespace is tolerated around the parentheses and before the
    # terminating semicolon.
    mobj = re.search(
        r'new\s+Clappr\.Player\s*\(\s*(?P<json>{.+?})\s*\)\s*;',
        flattened)
    if not mobj:
        return None
    try:
        clappr_data = self._parse_json(
            mobj.group('json'),
            video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        # Deliberate best effort: an unparsable options object is reported
        # to the caller as "no player data found".
        return None
    if isinstance(clappr_data, dict):
        return clappr_data
    return None
def _parse_clappr_data(self, clappr_data, video_id=None, require_title=True,
                       m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Parse Clappr player data into an info dict.

    http://clappr.github.io/classes/Player.html#method_constructor

    clappr_data   -- dict of options given to the Clappr.Player constructor
    video_id      -- stored as 'id' and forwarded to the format extraction
    require_title -- accepted for signature compatibility; not read here
    m3u8_id, mpd_id, rtmp_params, base_url
                  -- forwarded to ``_extract_url_list_formats``; ``base_url``
                     is also used to resolve relative subtitle URLs

    Returns a dict with 'id', 'subtitles' and 'formats', plus 'thumbnail'
    and 'title' when the player data provides them.
    """
    info_dict = {
        'id': video_id,
        'subtitles': {},
    }

    # Prefer the `sources` list and fall back to the single `source` option.
    # When neither is set an empty list is passed on, instead of [None],
    # which would be resolved against base_url and treated as a media URL.
    sources = clappr_data.get('sources')
    if not sources:
        single_source = clappr_data.get('source')
        sources = [single_source] if single_source else []
    elif not isinstance(sources, (list, tuple)):
        sources = [sources]
    info_dict['formats'] = self._extract_url_list_formats(
        sources, video_id=video_id, m3u8_id=m3u8_id, mpd_id=mpd_id,
        rtmp_params=rtmp_params, base_url=base_url)

    thumbnail = clappr_data.get('poster')
    if thumbnail:
        info_dict['thumbnail'] = thumbnail

    # Title as configured for the `chromecast` plugin
    chromecast = clappr_data.get('chromecast')
    title = chromecast.get('title') if isinstance(chromecast, dict) else None
    if title:
        info_dict['title'] = title

    # Subtitles:
    # https://github.com/clappr/clappr/blob/master/doc/BUILTIN_PLUGINS.md#playback-configuration
    tracks = clappr_data.get('externalTracks')
    if not tracks:
        playback = clappr_data.get('playback')
        if isinstance(playback, dict):
            tracks = playback.get('externalTracks')
    for track in tracks or []:
        if not isinstance(track, dict):
            continue
        # Tracks without a `kind` are treated as subtitles
        if track.get('kind', 'subtitles') != 'subtitles':
            continue
        src = track.get('src')
        if not src:
            continue
        lang = (track.get('lang') or track.get('language')
                or track.get('label') or 'undefined')
        info_dict['subtitles'].setdefault(lang, []).append({
            'url': compat_urlparse.urljoin(base_url, src),
            'ext': determine_ext(src),
        })

    # https://github.com/JMVTechnology/Clappr-Subtitle
    # `subtitle` is either a dict with src/lang/label or a bare URL string.
    subtitle = clappr_data.get('subtitle')
    src = lang = None
    if isinstance(subtitle, dict):
        src = subtitle.get('src')
        lang = subtitle.get('lang') or subtitle.get('label')
    elif subtitle:
        src = subtitle
    if src:
        src = compat_urlparse.urljoin(base_url, src)
        ext = determine_ext(src)
        if not lang:
            # No language given: derive one from the file name, dropping
            # the video id and the extension when the name contains the id.
            lang = src.split('/')[-1]
            if video_id and video_id in lang:
                lang = lang.replace("%s_" % video_id, '').replace(video_id, '').replace(".%s" % ext, '')
        info_dict['subtitles'].setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return info_dict
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
query = compat_urlparse.urlparse(url).query query = compat_urlparse.urlparse(url).query
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
@ -2533,6 +2612,54 @@ class InfoExtractor(object):
}) })
return formats return formats
def _extract_url_list_formats(self, sources, video_id=None,
                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Transform ["url1", "url2", {source: <>, mimeType: <>}] to formats.

    sources     -- iterable whose items are either a media URL or a dict
                   with 'source' (URL) and optional 'mimeType'
    video_id    -- forwarded to the manifest extractors and to the request
    m3u8_id     -- forwarded to the m3u8 and f4m extractors
    mpd_id      -- forwarded to the mpd extractor
    rtmp_params -- accepted for signature compatibility; not read here
    base_url    -- when set, every source URL is resolved against it

    The extension comes from the MIME type when known, otherwise from the
    URL (default 'mp4'). m3u8, mpd, smil and f4m sources are expanded by
    the matching manifest extractor; any other source becomes one format
    whose 'filesize' and 'preference' come from the Content-Length header
    when the server reports one.

    Raises ExtractorError (expected) when no format could be produced.
    """
    formats = []
    # The position in `sources` doubles as the format id of direct links.
    for format_id, source in enumerate(sources or []):
        # The media source URL, or {source: <>, mimeType: <>}
        if isinstance(source, dict):
            source_url = source.get('source')
            mime = source.get('mimeType')
        else:
            source_url = source
            mime = None
        if not source_url:
            # Nothing to download; joining an empty URL with base_url
            # would yield the page URL itself.
            continue
        if base_url:
            source_url = compat_urlparse.urljoin(base_url, source_url)
        ext = mimetype2ext(mime) or determine_ext(source_url, 'mp4')
        if ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False, preference=1))
        elif ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        elif ext == 'f4m':
            formats.extend(self._extract_f4m_formats(
                source_url, video_id, m3u8_id=m3u8_id, fatal=False))
        else:
            urlh = self._request_webpage(
                source_url, video_id,
                note="Checking format %d information" % format_id,
                fatal=False)
            size = None
            # With fatal=False the request may not return a response
            # object, and the header may be missing or malformed.
            if urlh:
                try:
                    size = int(urlh.headers.get('Content-Length'))
                except (TypeError, ValueError):
                    size = None
            fmt = {
                'url': source_url,
                'ext': ext,
                'format_id': "%d" % format_id,
            }
            if size is not None:
                fmt['filesize'] = size
                # Larger files are preferred: one point per 10 MiB
                fmt['preference'] = int(size / 1024 / 1024 / 10)
            formats.append(fmt)
    if not formats:
        raise ExtractorError('Source not found', expected=True, video_id=video_id)
    return formats
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search( mobj = re.search(
r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',

View File

@ -1266,6 +1266,7 @@ from .viewlift import (
ViewLiftEmbedIE, ViewLiftEmbedIE,
) )
from .viewster import ViewsterIE from .viewster import ViewsterIE
from .vidlox import VidloxIE
from .viidea import ViideaIE from .viidea import ViideaIE
from .vimeo import ( from .vimeo import (
VimeoIE, VimeoIE,

View File

@ -2060,6 +2060,16 @@ class GenericIE(InfoExtractor):
'skip': 'TODO: fix nested playlists processing in tests', 'skip': 'TODO: fix nested playlists processing in tests',
}, },
# { # {
# # Clappr.Player({})
# 'url': 'http://demo.teleosmedia.com/mosaic/',
# 'md5': "TODO",
# 'info_dict': {
# 'id': 'mosaic',
# 'title': 'video',
# 'ext': 'mp4'
# },
# },
# {
# # TODO: find another test # # TODO: find another test
# # http://schema.org/VideoObject # # http://schema.org/VideoObject
# 'url': 'https://flipagram.com/f/nyvTSJMKId', # 'url': 'https://flipagram.com/f/nyvTSJMKId',
@ -3118,6 +3128,13 @@ class GenericIE(InfoExtractor):
jwplayer_data, video_id, require_title=False, base_url=url) jwplayer_data, video_id, require_title=False, base_url=url)
return merge_dicts(info, info_dict) return merge_dicts(info, info_dict)
# Clappr.player()
clappr_dict = self._find_clappr_data(webpage, video_id)
if clappr_dict:
info = self._parse_clappr_data(clappr_dict,
video_id=video_id, base_url=url)
return merge_dicts(info, info_dict)
# Video.js embed # Video.js embed
mobj = re.search( mobj = re.search(
r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',

View File

@ -0,0 +1,75 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ..utils import ExtractorError
from .common import InfoExtractor
from .openload import PhantomJSwrapper
class VidloxIE(InfoExtractor):
    """
    Extractor for vidlox.me / vidlox.tv video and embed pages.

    The page is run through PhantomJS and the Clappr player options found
    in the resulting HTML are turned into the info dict.
    """

    _VALID_URL = r'https?://(?:www\.)?vidlox\.(?:me|tv)/(?:embed-)?(?P<id>[0-9a-z]+)(?:\.html)?'

    _TESTS = [{
        'url': 'https://vidlox.me/5tq733o3wj1d',
        'md5': 'f780592146ad0458679064de891f3e3f',
        'info_dict': {
            'id': '5tq733o3wj1d',
            'ext': 'mp4',
            'title': r're:big buck bunny 1080p surround',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': {
                'Spanish': [{
                    'ext': 'srt',
                }],
            }
        }
    }, {
        'url': 'https://vidlox.me/embed-bs2nk6dgqio1.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """
        Build the info dict for a Vidlox URL.

        Raises ExtractorError when the page reports 'File not found' or
        when no Clappr player data is present in the rendered page.
        """
        video_id = self._match_id(url)
        target_url = "https://vidlox.me/%s" % video_id

        js_runner = PhantomJSwrapper(self, required_version='2.0')

        # Fetch the plain page first for a couple of simple checks
        html = self._download_webpage(target_url, video_id)
        html = html.replace("\n", "").replace("\t", "")

        if 'File not found' in html:
            raise ExtractorError('File not found', expected=True, video_id=video_id)

        embed_title = None
        if 'This video can be watched as embed only.' in html:
            # Extract the title here, then switch over to the embed page
            raw_title = self._html_search_regex(
                r'<title[^>]*?>(?P<title>.+?)\s*</title>', html, 'title')
            embed_title = raw_title.replace('Watch ', '', 1)
            html = None
            target_url = "https://vidlox.me/embed-%s.html" % video_id

        # Execute the page's JavaScript
        html, _unused = js_runner.get(target_url, html, video_id=video_id)

        # Locate the player options
        player_options = self._find_clappr_data(html, video_id)
        if not player_options:
            raise ExtractorError(
                'Player data not found', expected=False, video_id=video_id)

        # Convert the player options into the info dict
        result = self._parse_clappr_data(
            player_options, video_id=video_id, base_url=target_url)

        if embed_title:
            result['title'] = embed_title
        else:
            result['title'] = self._html_search_regex(
                r'<h1[^>]*?>(?P<title>.+?)\s*</h1>', html, 'title')
        return result