123 lines
5.9 KiB
Python
Raw Normal View History

2018-01-18 13:29:18 +01:00
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
2019-10-02 21:18:57 +02:00
from ..utils import (
2019-10-09 21:39:40 +02:00
js_to_json, RegexNotFoundError, get_element_by_id, unified_strdate
2019-10-02 21:18:57 +02:00
)
2019-10-02 21:18:36 +02:00
import re
2018-01-18 13:29:18 +01:00
class HHUIE(InfoExtractor):
    """Extractor for videos hosted on the HHU Mediathek (mediathek.hhu.de)."""

    _VALID_URL = r'https://mediathek\.hhu\.de/watch/(?P<id>.+)'
    _TEST = {
        'url': 'https://mediathek.hhu.de/watch/2dd05982-ea45-4108-9620-0c36e6ed8df5',
        'md5': 'b99ff77f2148b1e754555abdf53f0e51',
        'info_dict': {
            'id': '2dd05982-ea45-4108-9620-0c36e6ed8df5',
            'ext': 'mp4',
            'title': 'Das Multimediazentrum',
            'description': '',
            'categories': ['Imagefilme'],
            # sic: 'Ute' and 'Clames' are apparently listed as two separate
            # tags by the site, so the expected list mirrors that.
            'tags': [
                'MMZ', 'Multimediazentrum', 'Heinrich-Heine-Universität',
                'UKD', 'eLearning', 'Abstimmsysteme', 'Portale',
                'Studierendenportal', 'Lehrfilme', 'Lehrfilm',
                'Operationsfilme', 'Vorlesungsaufzeichnung', 'Multimedia',
                'ZIM', 'HHU', 'Ute', 'Clames',
            ],
            'uploader': 'clames',
            'uploader_id': 'clames',
            'license': 'CC BY 3.0 DE',
            'upload_date': '20150126',
            'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg',
        },
    }

    def _extract_player_info(self, webpage, video_id):
        """Return the base info dict (id and formats) for a watch page.

        The player configuration passed to ``playerInstance.setup(...)`` is
        parsed with ``_parse_jwplayer_data``.  If that configuration cannot
        be found or parsed, a warning is reported and the format URLs are
        guessed from the file id found in the page.
        """
        try:
            config_js = self._search_regex(
                r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js')
            # Remove 'link: encodeURI("<our url>"),' -- presumably because a
            # function call cannot be converted to JSON.  The trailing comma
            # is optional so nothing else gets eaten when it is absent.
            config_js = re.sub(
                r'link: encodeURI\([^)]*\),?', '', config_js, count=1)
            config = self._parse_json(
                config_js, video_id, transform_source=js_to_json)
            return self._parse_jwplayer_data(
                config, video_id, require_title=False,
                base_url='https://mediathek.hhu.de/')
        except (RegexNotFoundError, ValueError):
            self.report_warning('failed to get player config, guessing formats')
            # This will likely work but better warn.
            file_id = self._html_search_regex(
                r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '",
                webpage, 'file_id')
            return {
                # 'id' (not 'video_id') is the key the rest of the info dict
                # handling expects; see the 'id' entry in _TEST.
                'id': video_id,
                'formats': [
                    {'url': format_url.format(file_id)}
                    for format_url in (
                        'https://mediathek.hhu.de/movies/{}/v_10.webm',
                        'https://mediathek.hhu.de/movies/{}/v_10.mp4',
                        'https://mediathek.hhu.de/movies/{}/v_50.webm',
                        'https://mediathek.hhu.de/movies/{}/v_50.mp4',
                        'https://mediathek.hhu.de/movies/{}/v_100.webm',
                        'https://mediathek.hhu.de/movies/{}/v_100.mp4',
                    )
                ],
            }

    def _real_extract(self, url):
        """Build the info dict for a single Mediathek watch page.

        Formats come from the embedded player configuration (or are guessed
        when it is unavailable); title, description, thumbnail, uploader,
        license, upload date, category and tags are scraped from the page.
        Raises via ``raise_login_required`` when the request ends up on an
        ``https://sts.`` URL.
        """
        video_id = self._match_id(url)
        webpage, webpage_url = self._download_webpage_handle(url, video_id)
        # Some videos need a login, maybe TODO.
        if webpage_url.geturl().startswith("https://sts."):
            self.raise_login_required()

        info = self._extract_player_info(webpage, video_id)

        if not info.get('title'):
            # Non-fatal here so the og:title fallback below is reachable.
            info['title'] = self._html_search_regex(
                r'<h1 id="mt_watch-headline-title">\s+(.+?)\s+<\/h1>',
                webpage, 'title', default=None)
        if not info.get('title'):
            # Last resort; left fatal so a page without any title still
            # fails inside the extractor as it did before.
            info['title'] = self._og_search_title(webpage)

        info['description'] = self._html_search_regex(
            r'<p id="mt_watch-description" class="watch-description">\s+(.+?)\s+<\/p>',
            webpage, 'description', fatal=False)
        if not info.get('description'):
            info['description'] = self._og_search_description(webpage, default='')

        if not info.get('thumbnail'):
            info['thumbnail'] = self._og_search_property(
                'image:secure_url', webpage, 'thumbnail', fatal=False)

        info['uploader'] = self._html_search_regex(
            r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href=".+">(.+?)<\/a>',
            webpage, 'uploader', fatal=False)
        info['uploader_id'] = self._html_search_regex(
            r'<a id="mt_content_placeholder_videoinfo_createdby" class="author" href="/user/(.+)">.+?<\/a>',
            webpage, 'uploader_id', fatal=False)

        # CC licenses get a image with an appropriate alt text
        license_img = get_element_by_id('mt_watch_license', webpage)
        if license_img:
            info['license'] = self._search_regex(
                r'alt="(.+)"', license_img, 'license_img', fatal=False)
        if not license_img or not info.get('license'):
            # other licenses are just text
            info['license'] = self._html_search_regex(
                r'<div id="mt_content_placeholder_videotabs_mt_videotabs_formview_video_license" class="video-license">(.+)<\/div>',
                webpage, 'license_text', fatal=False)

        # The lookup is non-fatal and may yield None; only parse real text.
        date_text = self._html_search_regex(
            r'<span class="watch-information-date added">(.+?)<\/span>',
            webpage, 'upload_date', fatal=False)
        info['upload_date'] = _date(date_text) if date_text else None

        category = self._html_search_regex(
            r'<a href="/category/.+">(.+)</a>', webpage, 'category', fatal=False)
        # there's just one category per video; avoid emitting [None]
        info['categories'] = [category] if category else []

        # get_element_by_id may find nothing; treat that as "no tags".
        tags_html = get_element_by_id('mt_watch_info_tag_list', webpage)
        info['tags'] = _tags(tags_html) if tags_html else []
        return info
2019-10-02 21:18:57 +02:00
def _date(str_containing_date):
"""Parse the string 'at (M)M/(D)D/YYYY' to YYYYMMDD."""
return unified_strdate(str_containing_date.split(' ')[1], day_first=False)
def _tags(tags_html):
"""Parse the HTML markup containing the tags."""
matches = re.findall(r'<a.+>(.+)<\/a>', tags_html)
return [match.rstrip(',') for match in matches]