[telebasel] [simplex] Add new information extractors

2017-02-06 17:01:34 +01:00 · 2017-02-06 17:01:34 +01:00 · 91d21e0a84
commit 91d21e0a84
parent d5d904ff7d
3 changed files with 366 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -849,6 +849,10 @@ from .shared import (
    VivoIE,
 )
 from .showroomlive import ShowRoomLiveIE
 from .simplex import (
    SimplexIE,
    SimplexHostsIE,
 )
 from .sina import SinaIE
 from .sixplay import SixPlayIE
 from .skynewsarabia import (
@ -931,6 +935,10 @@ from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tele13 import Tele13IE
 from .telebasel import (
    TelebaselMediathekIE,
    TelebaselArticleIE,
 )
 from .telebruxelles import TeleBruxellesIE
 from .telecinco import TelecincoIE
 from .telegraaf import TelegraafIE
--- a/youtube_dl/extractor/simplex.py
+++ b/youtube_dl/extractor/simplex.py
@ -0,0 +1,233 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    str_or_none,
    try_get,
    urljoin,
 )
 class SimplexIE(InfoExtractor):
    """Extractor for videos served by a Simplex player host.

    Handles internal URLs of the form
    ``simplex:<server_url>:<customer_id>:<author_id>:<project_id>``.
    It downloads ``data.sid`` (player data) and ``pl01.sid`` (video data)
    from ``<server_url>/content/<customer_id>/<author_id>/<project_id>/``
    and builds the format list from the qualities listed in the player data.
    """

    IE_DESC = 'Simplex Player'
    _VALID_URL = r'''(?x)
                simplex:
                (?P<server_url>https?://(?:www\.)?.+):
                (?P<customer_id>\d+):
                (?P<author_id>\d+):
                (?P<project_id>\d+)
                '''
    _TEST = {
        'url': 'simplex:http://video.telebasel.ch:4062:4063:62349',
        'only_matching': True,
    }

    @staticmethod
    def _extract_width_height(resolution):
        """Split a ``'<width>x<height>'`` string into a ``(width, height)`` tuple.

        Returns ``(None, None)`` when ``resolution`` is not a string or does
        not consist of exactly two ``x``-separated parts.  A part that is not
        numeric yields ``None`` for that component.
        """
        try:
            w, h = resolution.split('x')
        except (AttributeError, ValueError):
            return None, None
        return int_or_none(w), int_or_none(h)

    @staticmethod
    def _strip_json_wrapper(source):
        """Return the outermost ``{...}`` span of ``source``.

        When no such span exists the input is returned unchanged, so that the
        JSON parser reports the failure instead of a bare ``ValueError``
        escaping from ``str.index``.
        """
        start = source.find('{')
        end = source.rfind('}')
        if start == -1 or end < start:
            return source
        return source[start:end + 1]

    def _known_simplex_format(self, simplex_formats, fid):
        """Return the entry of ``simplex_formats`` that covers ``fid``.

        An entry's ``'id'`` is either a single id string or a list of id
        strings.  Returns ``None`` when no entry matches.
        """
        for sf in simplex_formats:
            known_ids = sf['id']
            # Do not compare against the `str` type: with unicode_literals the
            # ids are `unicode` objects on Python 2 and would never match.
            if isinstance(known_ids, (list, tuple)):
                if fid in known_ids:
                    return sf
            elif known_ids == fid:
                return sf
        return None

    def _real_extract(self, url):
        """Extract metadata and formats for a ``simplex:`` URL.

        Raises ExtractorError when the video data carries no ``data`` object
        or the player data lists no qualities.
        """
        mobj = re.match(self._VALID_URL, url)
        server_url = mobj.group('server_url')
        customer_id = mobj.group('customer_id')
        author_id = mobj.group('author_id')
        project_id = mobj.group('project_id')
        video_id = '%s-%s-%s' % (customer_id, author_id, project_id)

        content_url = urljoin(
            server_url,
            'content/%s/%s/%s/' % (customer_id, author_id, project_id))

        player_data = self._download_json(
            urljoin(content_url, 'data.sid'),
            video_id,
            note='Downloading player data JSON',
            errnote='Unable to download player data JSON')
        # The video data is cut down to its outermost {...} span before parsing.
        video_data = self._download_json(
            urljoin(content_url, 'pl01.sid'),
            video_id,
            note='Downloading video data JSON',
            errnote='Unable to download video data JSON',
            transform_source=self._strip_json_wrapper)

        title = str_or_none(player_data['title'])
        description = str_or_none(player_data.get('description'))
        timestamp = int_or_none(player_data.get('createDate'))
        language = str_or_none(player_data.get('language'))
        # NOTE(review): scale=10 presumably means the duration is reported in
        # tenths of a second - confirm against the player data.
        duration = float_or_none(player_data.get('duration'), scale=10)

        file_information = try_get(video_data, lambda x: x['data'], dict)
        if not file_information:
            raise ExtractorError('Cannot extract file information data.')
        filename = str_or_none(file_information.get('filename'))
        thumbname = str_or_none(file_information.get('thumb'))
        thumbnail = urljoin(content_url, thumbname + '.jpg') if thumbname else None

        qualities = try_get(player_data, lambda x: x['qualities'], list)
        if not qualities:
            raise ExtractorError('Cannot find available formats.')

        # simplex_formats is the list of known simplex player formats.
        # There might be some more format ids, but we are not sure what they do:
        # id 400: It was indicated to be for Apple TV.
        # id 500: No additional information found.
        simplex_formats = []
        if filename:
            # The progressive formats are addressed through the file name;
            # without one only the HLS playlist can be offered.
            simplex_formats.extend([
                {'id': '20', 'filename': filename + '.flv', 'method': 'url'},
                {'id': '40', 'filename': filename + '_40.flv', 'method': 'url'},
                {'id': '200', 'filename': filename + '.mp4', 'method': 'url'},
            ])
        simplex_formats.append(
            {'id': ['300', '350', '355', '360'], 'filename': 'index.m3u8', 'method': 'm3u8'})

        formats = []
        m3u8_done = False
        format_infos = []
        for quali in qualities:
            if not isinstance(quali, dict):
                continue
            fid = str_or_none(quali.get('id'))
            vbr = int_or_none(quali.get('b'))
            resolution = str_or_none(quali.get('s'))
            width, height = SimplexIE._extract_width_height(resolution)
            # NOTE(review): the 'hls-' format_id prefix is also applied to the
            # progressive flv/mp4 formats below; kept unchanged because format
            # ids are user-visible.
            form_info = {
                'resolution': resolution,
                'width': width,
                'height': height,
                'vbr': vbr,
                'abr': int_or_none(quali.get('ab')),
                'asr': int_or_none(quali.get('ar')),
                'fps': int_or_none(quali.get('r')),
                'language': language,
                'format_id': 'hls-%s' % str_or_none(vbr)
            }
            format_infos.append(form_info)

            simplex_format = self._known_simplex_format(simplex_formats, fid)
            if not simplex_format:
                continue
            format_url = urljoin(content_url, simplex_format['filename'])
            if simplex_format['method'] == 'url':
                form = {
                    'url': format_url
                }
                form.update(form_info)
                formats.append(form)
            elif simplex_format['method'] == 'm3u8' and not m3u8_done:
                # All HLS quality ids share one playlist, so fetch it once.
                formats.extend(self._extract_m3u8_formats(
                    format_url,
                    video_id,
                    ext='mp4',
                    entry_protocol='m3u8_native'))
                m3u8_done = True

        # Try to add additional information to the formats extracted by
        # _extract_m3u8_formats: the variant URL is searched for '<vbr>kb.m3u8'
        # and matched against the video bitrate of the listed qualities.
        for form in formats:
            if not form['url'].endswith('.m3u8'):
                continue
            vbr = int_or_none(
                self._search_regex(r'(\d+)kb\.m3u8', form['url'], 'm3u8 vbr', default=None))
            if not vbr:
                continue
            form_info = next((f for f in format_infos if f['vbr'] == vbr), None)
            if form_info:
                form.update(form_info)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
        }
 class SimplexHostsIE(InfoExtractor):
    """Match content URLs on known Simplex hosts and delegate to SimplexIE.

    The matched server URL and the three numeric ids are repackaged into an
    internal ``simplex:`` URL.
    """

    _VALID_URL = r'''(?x)
                (?P<server_url>https?://(?:www\.)?
                    (?:
                        video\.telebasel\.ch|
                        media10\.simplex\.tv
                    )
                )
                /content/
                (?P<customer_id>\d+)/
                (?P<author_id>\d+)/
                (?P<project_id>\d+)
                '''
    _TESTS = [{
        'url': 'http://media10.simplex.tv/content/906/907/76997/',
        'md5': 'e6b8ebefac5aeae4a6790fec18382ca0',
        'info_dict': {
            'id': '906-907-76997',
            'ext': 'flv',
            'title': '03.02.17: Der Trailer zum Rückrunden-Start',
            'description': None,
            'duration': 44.0,
            'timestamp': 1486135964,
            'upload_date': '20170203',
            'url': 'http://media10.simplex.tv/content/906/907/76997/simvid_1_40.flv',
            'thumbnail': 'http://media10.simplex.tv/content/906/907/76997/simvid_1.jpg',
            'language': 'de',
            'width': 1280,
            'height': 720,
            'vbr': 2304,
            'abr': 160,
            'fps': 25,
            'asr': 44100,
            'resolution': '1280x720'
        }
    }, {
        'url': 'https://video.telebasel.ch/content/4062/4063/77067',
        'info_dict': {
            'id': '4062-4063-77067',
            'ext': 'flv',
            'title': 'News vom 05.02.2017',
            'description': 'md5:23fb960068621263d5d4418996387674',
            'timestamp': 1486314961,
            'upload_date': '20170205',
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the equivalent ``simplex:`` URL."""
        match = re.match(self._VALID_URL, url)
        host = match.group('server_url')
        # customer, author and project id, in the order both strings need them
        id_parts = match.group('customer_id', 'author_id', 'project_id')
        return self.url_result(
            'simplex:%s:%s:%s:%s' % ((host,) + id_parts),
            ie=SimplexIE.ie_key(),
            video_id='-'.join(id_parts))
--- a/youtube_dl/extractor/telebasel.py
+++ b/youtube_dl/extractor/telebasel.py
@ -0,0 +1,125 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from .simplex import SimplexIE
 from ..utils import (
    ExtractorError,
    str_or_none,
    strip_or_none,
    remove_end,
    try_get,
    urljoin,
 )
 class TelebaselBaseIE(InfoExtractor):
    """Shared constants for the telebasel.ch extractors.

    The subclasses in this module combine these values with a project id to
    build ``simplex:<server>:<customer>:<author>:<project>`` URLs.
    """
    # Base URL of the video server; also used to build the episodes JSON URL.
    _SERVER_URL = 'https://video.telebasel.ch/'
    # Customer id segment used in the content and multichannel paths.
    _CUSTOMER_ID = '4062'
    # Author id segment used in the content path.
    _AUTHOR_ID = '4063'
 class TelebaselMediathekIE(TelebaselBaseIE):
    """Extractor for telebasel.ch Mediathek show pages.

    The project id is taken from the ``pid`` query parameter when present.
    Otherwise it is the first project listed in the show's channel JSON.
    The result is delegated to SimplexIE through a ``simplex:`` URL.
    """

    IE_DESC = 'telebasel.ch Mediathek'
    _VALID_URL = r'''(?x)
                https?://
                    (?:www\.)?
                    telebasel\.ch/
                    (?!telebasel-archiv)
                    (?!\d+)
                    (?P<show_name>[^/]+)
                    (?:
                        /.*pid=(?P<pid>\d+).*
                    )?
                '''
    _TESTS = [{
        'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-reihe-8',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-talk/?channel=15881',
        'only_matching': True,
    }]

    @staticmethod
    def _strip_json_wrapper(source):
        """Return the outermost ``{...}`` span of ``source``.

        When no such span exists the input is returned unchanged, so that the
        JSON parser reports the failure instead of a bare ``ValueError``
        escaping from ``str.index``.
        """
        start = source.find('{')
        end = source.rfind('}')
        if start == -1 or end < start:
            return source
        return source[start:end + 1]

    def _extract_video_id(self, url, show_name):
        """Look up the first project id of the show found at ``url``.

        Downloads the show page, reads the channel id from the
        ``tb-mediathek-videos`` element and fetches that channel's episodes
        JSON.  Raises ExtractorError when the first project carries no
        integer ``projectId``.
        """
        webpage = self._download_webpage(url, show_name)
        channel_id = self._html_search_regex(
            r'<div[^>]+class=["\']tb-mediathek-videos["\'][^>]+data-channels=["\'](\d+)["\']',
            webpage, 'channel id')

        episodes_url = urljoin(
            self._SERVER_URL,
            'multichannel/%s/%s/.ofdd/json' % (self._CUSTOMER_ID, channel_id))
        # The response is cut down to its outermost {...} span before parsing.
        episodes = self._download_json(
            episodes_url,
            channel_id,
            note='Downloading episodes JSON',
            errnote='Unable to download episodes JSON',
            transform_source=self._strip_json_wrapper)

        video_id = str_or_none(
            try_get(episodes, lambda x: x['projects'][0]['projectId'], int))
        if not video_id:
            raise ExtractorError('Could not extract video id from the webpage.')
        return video_id

    def _real_extract(self, url):
        """Resolve the project id and hand the video over to SimplexIE."""
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group('show_name')
        video_id = mobj.group('pid')
        if not video_id:
            # No explicit pid in the URL: fall back to the channel listing.
            video_id = self._extract_video_id(url, show_name)

        return self.url_result(
            'simplex:%s:%s:%s:%s' % (
                self._SERVER_URL, self._CUSTOMER_ID,
                self._AUTHOR_ID, video_id),
            ie=SimplexIE.ie_key())
 class TelebaselArticleIE(TelebaselBaseIE):
    """Extractor for telebasel.ch articles with embedded Simplex players.

    Every ``<iframe>`` whose ``src`` points below the Telebasel content URL
    becomes one playlist entry that is delegated to SimplexIE.
    """

    IE_DESC = 'telebasel.ch articles'
    # The slug stops at '?' and '#' so that a query string or fragment never
    # ends up in the id when the URL has no trailing slash.
    _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P<id>\d{4}/\d{2}/\d{2}/[^/?#]+)/?'
    _TEST = {
        'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100',
        'info_dict': {
            'id': '2017/02/01/report-usr-iii-einfach-erklaert',
            'title': 'Report: USR III einfach erklärt',
            'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Return a playlist of all Simplex videos embedded in the article."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        content_url = urljoin(
            self._SERVER_URL,
            'content/%s/%s/' % (self._CUSTOMER_ID, self._AUTHOR_ID))
        # The literal URL is escaped so that its dots only match dots.  The
        # rest of the attribute is consumed with a negated class instead of a
        # greedy '.+', which could steal the last digit of the project id and
        # run across further iframes on the same line.
        embed_regex = r'<iframe[^>]+src=["\']%s(?P<pid>\d+)[^"\']*["\']' % re.escape(content_url)

        entries = [
            self.url_result(
                'simplex:%s:%s:%s:%s' % (
                    self._SERVER_URL, self._CUSTOMER_ID,
                    self._AUTHOR_ID, m.group('pid')),
                ie=SimplexIE.ie_key())
            for m in re.finditer(embed_regex, webpage)]

        title = strip_or_none(
            remove_end(self._og_search_title(webpage), '- Telebasel'))
        description = self._og_search_description(webpage)

        return self.playlist_result(
            entries,
            playlist_id=display_id,
            playlist_title=title,
            playlist_description=description)