# l1ving_youtube-dl/youtube_dl/extractor/nobelprize.py

from __future__ import unicode_literals

from .common import InfoExtractor

from ..utils import (
    clean_html,
    get_element_by_class,
)


class NobelprizeIE(InfoExtractor):
    """Extractor for videos shown in the nobelprize.org media player.

    Matches ``nobelprize.org/mediaplayer/`` URLs that carry a four-digit
    ``id=`` value, locates the Akamai ``master.m3u8`` playlist URL embedded
    in the player page and hands it to the HLS format helper.
    """

    _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer/.+?id=(?P<id>[0-9]{4})'
    IE_DESC = 'Nobelprize'

    _TEST = {
        'url': 'https://www.nobelprize.org/mediaplayer/index.php?id=2028',
        'md5': '19bb7134879a6e8f0731235f3c076321',
        'info_dict': {
            'id': '2028',
            'ext': 'mp4',
            'title': 'Acceptance Speech by Elie Wiesel (18 minutes)'
        }
    }

    def _real_extract(self, url):
        """Build the info dict for a single media player page.

        Args:
            url: A page URL matching ``_VALID_URL``.

        Returns:
            A dict with the keys ``id`` (the value returned by
            ``_match_id``), ``title`` (cleaned text of the page's
            ``video-headline`` element) and ``formats`` (the result of
            ``_extract_m3u8_formats`` for the playlist found in the page).
        """
        video_id = self._match_id(url)

        # Pass the extracted video id here; the original code passed the
        # ``id`` builtin function by mistake.
        webpage = self._download_webpage(url, video_id)

        # The page embeds (in a JS variable) the URL of a master m3u8
        # playlist that lists all available streams.  Dots in the host name
        # are escaped so they only match literal dots, and both http and
        # https playlist URLs are accepted.
        m3u8_playlist = self._search_regex(
            r"(https?://nobelvod-vh\.akamaihd\.net/i/flashcontent/.+master\.m3u8)",
            webpage,
            'm3u8 url',
        )

        formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4')

        return {
            'id': video_id,
            'title': clean_html(get_element_by_class('video-headline', webpage)),
            'formats': formats,
        }