[telebasel] [simplex] Add new information extractors

This commit is contained in:
Alex Seiler 2017-02-06 17:01:34 +01:00
parent d5d904ff7d
commit 91d21e0a84
3 changed files with 366 additions and 0 deletions

View File

@ -849,6 +849,10 @@ from .shared import (
VivoIE, VivoIE,
) )
from .showroomlive import ShowRoomLiveIE from .showroomlive import ShowRoomLiveIE
from .simplex import (
SimplexIE,
SimplexHostsIE,
)
from .sina import SinaIE from .sina import SinaIE
from .sixplay import SixPlayIE from .sixplay import SixPlayIE
from .skynewsarabia import ( from .skynewsarabia import (
@ -931,6 +935,10 @@ from .teamfourstar import TeamFourStarIE
from .techtalks import TechTalksIE from .techtalks import TechTalksIE
from .ted import TEDIE from .ted import TEDIE
from .tele13 import Tele13IE from .tele13 import Tele13IE
from .telebasel import (
TelebaselMediathekIE,
TelebaselArticleIE,
)
from .telebruxelles import TeleBruxellesIE from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE from .telegraaf import TelegraafIE

View File

@ -0,0 +1,233 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
str_or_none,
try_get,
urljoin,
)
class SimplexIE(InfoExtractor):
    """Extract a single video from a Simplex player installation.

    The extractor is addressed through an internal pseudo URL of the form
    ``simplex:<server_url>:<customer_id>:<author_id>:<project_id>``, which
    other extractors build and hand over via ``url_result``.
    """

    IE_DESC = 'Simplex Player'
    _VALID_URL = r'''(?x)
                    simplex:
                    (?P<server_url>https?://(?:www\.)?.+):
                    (?P<customer_id>\d+):
                    (?P<author_id>\d+):
                    (?P<project_id>\d+)
                    '''

    _TEST = {
        'url': 'simplex:http://video.telebasel.ch:4062:4063:62349',
        'only_matching': True,
    }

    @staticmethod
    def _extract_width_height(resolution):
        """Split a ``'<width>x<height>'`` string into a ``(width, height)`` pair.

        Returns ``(None, None)`` when ``resolution`` is not a string or does
        not consist of exactly two ``x``-separated parts.
        """
        try:
            w, h = resolution.split('x')
            w = int_or_none(w)
            h = int_or_none(h)
            return w, h
        except (AttributeError, ValueError):
            return None, None

    def _known_simplex_format(self, simplex_formats, fid):
        """Return the entry of ``simplex_formats`` responsible for ``fid``.

        An entry's ``'id'`` is either a single id string or a list of id
        strings. Returns ``None`` when no entry matches.
        """
        for sf in simplex_formats:
            sf_id = sf['id']
            # Do not test for ``str`` here: this module uses unicode_literals,
            # so on Python 2 the id literals are ``unicode`` and a
            # ``type(...) == str`` check would never succeed.
            if isinstance(sf_id, (list, tuple)):
                if fid in sf_id:
                    return sf
            elif sf_id == fid:
                return sf
        return None

    def _real_extract(self, url):
        """Download the player/video JSON documents and build the info dict.

        Raises ExtractorError when the file information or the list of
        qualities is missing from the downloaded data.
        """
        mobj = re.match(self._VALID_URL, url)
        server_url = mobj.group('server_url')
        customer_id = mobj.group('customer_id')
        author_id = mobj.group('author_id')
        project_id = mobj.group('project_id')
        video_id = '%s-%s-%s' % (customer_id, author_id, project_id)

        content_url = urljoin(
            server_url,
            'content/%s/%s/%s/' % (customer_id, author_id, project_id))

        player_data = self._download_json(
            urljoin(content_url, 'data.sid'),
            video_id,
            note='Downloading player data JSON',
            errnote='Unable to download player data JSON')

        # The response is cut down to the span between the first '{' and
        # the last '}' before it is parsed as JSON.
        video_data = self._download_json(
            urljoin(content_url, 'pl01.sid'),
            video_id,
            note='Downloading video data JSON',
            errnote='Unable to download video data JSON',
            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])

        title = str_or_none(player_data['title'])
        description = str_or_none(player_data.get('description'))
        timestamp = int_or_none(player_data.get('createDate'))
        language = str_or_none(player_data.get('language'))
        duration = float_or_none(player_data.get('duration'), scale=10)

        file_information = try_get(video_data, lambda x: x['data'], dict)
        if not file_information:
            raise ExtractorError('Cannot extract file information data.')

        filename = str_or_none(file_information.get('filename'))
        thumbname = str_or_none(file_information.get('thumb'))
        thumbnail = urljoin(content_url, thumbname + '.jpg') if thumbname else None

        qualities = try_get(player_data, lambda x: x['qualities'], list)
        if not qualities:
            raise ExtractorError('Cannot find available formats.')

        # simplex_formats is the list of known simplex player formats.
        # There might be some more format ids, but we are not sure what they do:
        #   id 400: It was indicated to be for Apple TV.
        #   id 500: No additional information found.
        simplex_formats = []
        if filename:
            # The direct downloads are derived from the base file name, so
            # they can only be offered when the server reported one.
            simplex_formats.extend([
                {'id': '20', 'filename': filename + '.flv', 'method': 'url'},
                {'id': '40', 'filename': filename + '_40.flv', 'method': 'url'},
                {'id': '200', 'filename': filename + '.mp4', 'method': 'url'},
            ])
        simplex_formats.append(
            {'id': ['300', '350', '355', '360'], 'filename': 'index.m3u8', 'method': 'm3u8'})

        formats = []
        m3u8_done = False
        format_infos = []
        for quali in qualities:
            fid = str_or_none(quali.get('id'))
            vbr = int_or_none(quali.get('b'))
            resolution = str_or_none(quali.get('s'))
            width, height = SimplexIE._extract_width_height(resolution)
            # NOTE(review): the 'hls-' prefix is also applied to the direct
            # flv/mp4 formats below -- confirm whether that is intended.
            form_info = {
                'resolution': resolution,
                'width': width,
                'height': height,
                'vbr': vbr,
                'abr': int_or_none(quali.get('ab')),
                'asr': int_or_none(quali.get('ar')),
                'fps': int_or_none(quali.get('r')),
                'language': language,
                'format_id': 'hls-%s' % str_or_none(vbr)
            }
            format_infos.append(form_info)

            simplex_format = self._known_simplex_format(simplex_formats, fid)
            if not simplex_format:
                continue

            format_url = urljoin(content_url, simplex_format['filename'])
            if simplex_format['method'] == 'url':
                form = {
                    'url': format_url
                }
                form.update(form_info)
                formats.append(form)
            elif simplex_format['method'] == 'm3u8' and not m3u8_done:
                # All HLS quality ids share one master playlist, so it is
                # fetched only once.
                forms = self._extract_m3u8_formats(
                    format_url,
                    video_id,
                    ext='mp4',
                    entry_protocol='m3u8_native')
                formats.extend(forms)
                m3u8_done = True

        # Try to add additional information to the formats extracted by
        # _extract_m3u8_formats: the bitrate embedded in the playlist URL is
        # used to look up the matching quality entry.
        for form in formats:
            if not form['url'].endswith('.m3u8'):
                continue
            vbr = int_or_none(
                self._search_regex(r'(\d+)kb.m3u8', form['url'], 'm3u8 vbr', default=None))
            if not vbr:
                continue
            form_info = next((f for f in format_infos if f['vbr'] == vbr), None)
            if form_info is not None:
                form.update(form_info)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
        }
class SimplexHostsIE(InfoExtractor):
    """Recognise direct content URLs on known Simplex hosts.

    The URL is only taken apart and re-assembled into a ``simplex:`` pseudo
    URL; the actual extraction is delegated to SimplexIE.
    """

    _VALID_URL = r'''(?x)
                    (?P<server_url>https?://(?:www\.)?
                        (?:
                            video\.telebasel\.ch|
                            media10\.simplex\.tv
                        )
                    )
                    /content/
                    (?P<customer_id>\d+)/
                    (?P<author_id>\d+)/
                    (?P<project_id>\d+)
                    '''

    _TESTS = [{
        'url': 'http://media10.simplex.tv/content/906/907/76997/',
        'md5': 'e6b8ebefac5aeae4a6790fec18382ca0',
        'info_dict': {
            'id': '906-907-76997',
            'ext': 'flv',
            'title': '03.02.17: Der Trailer zum Rückrunden-Start',
            'description': None,
            'duration': 44.0,
            'timestamp': 1486135964,
            'upload_date': '20170203',
            'url': 'http://media10.simplex.tv/content/906/907/76997/simvid_1_40.flv',
            'thumbnail': 'http://media10.simplex.tv/content/906/907/76997/simvid_1.jpg',
            'language': 'de',
            'width': 1280,
            'height': 720,
            'vbr': 2304,
            'abr': 160,
            'fps': 25,
            'asr': 44100,
            'resolution': '1280x720'
        }
    }, {
        'url': 'https://video.telebasel.ch/content/4062/4063/77067',
        'info_dict': {
            'id': '4062-4063-77067',
            'ext': 'flv',
            'title': 'News vom 05.02.2017',
            'description': 'md5:23fb960068621263d5d4418996387674',
            'timestamp': 1486314961,
            'upload_date': '20170205',
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Hand the matched server and ids over to SimplexIE."""
        parts = re.match(self._VALID_URL, url).groupdict()
        # customer, author and project id, in the order both strings use them
        id_triple = (
            parts['customer_id'], parts['author_id'], parts['project_id'])

        return self.url_result(
            'simplex:%s:%s:%s:%s' % ((parts['server_url'],) + id_triple),
            ie=SimplexIE.ie_key(),
            video_id='%s-%s-%s' % id_triple)

View File

@ -0,0 +1,125 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .simplex import SimplexIE
from ..utils import (
ExtractorError,
str_or_none,
strip_or_none,
remove_end,
try_get,
urljoin,
)
class TelebaselBaseIE(InfoExtractor):
    """Shared constants for the telebasel.ch extractors."""

    # Base URL of the Simplex server; the subclasses put it into the
    # ``simplex:`` pseudo URLs they build.
    _SERVER_URL = 'https://video.telebasel.ch/'
    # Customer and author ids used together with a project id to address
    # one video on that server.
    _CUSTOMER_ID = '4062'
    _AUTHOR_ID = '4063'
class TelebaselMediathekIE(TelebaselBaseIE):
    """Resolve telebasel.ch Mediathek show pages to a Simplex video.

    When the URL carries a ``pid`` parameter it is used directly; otherwise
    the first project listed for the show's channel is looked up.
    """

    IE_DESC = 'telebasel.ch Mediathek'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?
                        telebasel\.ch/
                        (?!telebasel-archiv)
                        (?!\d+)
                        (?P<show_name>[^/]+)
                        (?:
                            /.*pid=(?P<pid>\d+).*
                        )?
                    '''

    _TESTS = [{
        'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-reihe-8',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-talk/?channel=15881',
        'only_matching': True,
    }]

    def _extract_video_id(self, url, show_name):
        """Return the project id of the first entry in the show's channel.

        Raises ExtractorError when the episodes JSON yields no integer
        project id.
        """
        def cut_to_json(raw):
            # keep only the span from the first '{' to the last '}'
            return raw[raw.index('{'):raw.rindex('}') + 1]

        page = self._download_webpage(url, show_name)
        channel_id = self._html_search_regex(
            r'<div[^>]+class=["\']tb-mediathek-videos["\'][^>]+data-channels=["\'](\d+)["\']',
            page, 'channel id')

        listing_path = 'multichannel/%s/%s/.ofdd/json' % (
            self._CUSTOMER_ID, channel_id)
        episodes = self._download_json(
            urljoin(self._SERVER_URL, listing_path),
            channel_id,
            note='Downloading episodes JSON',
            errnote='Unable to download episodes JSON',
            transform_source=cut_to_json)

        first_project = try_get(
            episodes, lambda x: x['projects'][0]['projectId'], int)
        video_id = str_or_none(first_project)
        if video_id:
            return video_id
        raise ExtractorError('Could not extract video id from the webpage.')

    def _real_extract(self, url):
        """Build the ``simplex:`` pseudo URL and delegate to SimplexIE."""
        show_name, pid = re.match(self._VALID_URL, url).group(
            'show_name', 'pid')
        # Fall back to the channel listing when the URL has no pid.
        video_id = pid or self._extract_video_id(url, show_name)

        simplex_url = 'simplex:%s:%s:%s:%s' % (
            self._SERVER_URL, self._CUSTOMER_ID,
            self._AUTHOR_ID, video_id)
        return self.url_result(simplex_url, ie=SimplexIE.ie_key())
class TelebaselArticleIE(TelebaselBaseIE):
    """Turn a telebasel.ch article into a playlist of its embedded videos.

    Every ``<iframe>`` whose ``src`` points at the Simplex server's content
    path for the configured customer/author becomes one playlist entry that
    is delegated to SimplexIE.
    """

    IE_DESC = 'telebasel.ch articles'
    _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P<id>\d{4}/\d{2}/\d{2}/[^/]+)/?'

    _TEST = {
        'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100',
        'info_dict': {
            'id': '2017/02/01/report-usr-iii-einfach-erklaert',
            'title': 'Report: USR III einfach erklärt',
            'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Collect all Simplex embeds of the article page as a playlist."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Build the pattern directly instead of running a regex fragment
        # through urljoin, and escape the literal server URL so that its
        # dots only match dots.
        # The rest of the src attribute is matched with a negated character
        # class: a greedy '.+' needs at least one character before the
        # closing quote, which takes the last digit of the project id when
        # the src ends right after it, and it can run across several
        # iframes that share a line.
        embed_regex = (
            r'<iframe[^>]+src=["\']%scontent/%s/%s/(?P<pid>\d+)[^"\']*["\']' % (
                re.escape(self._SERVER_URL),
                re.escape(self._CUSTOMER_ID),
                re.escape(self._AUTHOR_ID)))

        entries = [
            self.url_result(
                'simplex:%s:%s:%s:%s' % (
                    self._SERVER_URL, self._CUSTOMER_ID,
                    self._AUTHOR_ID, m.group('pid')),
                ie=SimplexIE.ie_key())
            for m in re.finditer(embed_regex, webpage)]

        title = strip_or_none(
            remove_end(self._og_search_title(webpage), '- Telebasel'))
        description = self._og_search_description(webpage)

        return self.playlist_result(
            entries,
            playlist_id=display_id,
            playlist_title=title,
            playlist_description=description)