4oD extractor, with a testcase

mostly unusable, since FlashAccess DRM is not implemented, here or in ffmpeg. youtube-dl can at least download subtitles and metadata, and make the stream URL available through -j for external decryption.
2015-03-20 22:37:02 +01:00 · 2015-03-20 22:37:02 +01:00 · 4866273f50
commit 4866273f50
parent 531980d89c
2 changed files with 79 additions and 0 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -63,6 +63,7 @@ from .cbsnews import CBSNewsIE
 from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel4 import Channel4IE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
 from .chirbit import (
--- a/youtube_dl/extractor/channel4.py
+++ b/youtube_dl/extractor/channel4.py
@ -0,0 +1,78 @@
 from __future__ import unicode_literals
 import json
 from .common import InfoExtractor
 from ..compat import (
    compat_urllib_request,
    compat_urlparse,
 )
 from ..utils import (
    unified_strdate,
    ExtractorError
 )
 class Channel4IE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?channel4\.com/programmes/(?P<pid>.*?)/on-demand/(?P<id>.*)'
    _TESTS = [{
        'url': 'http://www.channel4.com/programmes/black-mirror/on-demand/49114-002',
        'info_dict': {
            'id': '49114-002',
            '_programme_title': "Black Mirror",
            'title': "15 Million Merits",
            'description': "In the near future, everyone is confined to a life of strange physical drudgery. The only way to escape is to enter the 'Hot Shot' talent show and pray you can impress the judges.",
            'duration': 222780,
        },
        'params': {
            # unimplemented DRM
            'skip_download': True,
        }
    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        video_data = json.loads(self._search_regex(
            r'onDemand.selectedEpisode = (?P<json>{.+?});\n',
            webpage, 'video data json'))
        thumbnails = [{
            'url': video_data["pictureComponent"]["url"]
        }]
        request_id = video_data["requestId"]
        # XXX: the Flash player also puts the Unix timestamp in the query string. the download works without it just fine, though
        asset_url = 'http://ais.channel4.com/asset/%s' % (request_id)
        stream_info = self._download_xml(asset_url, video_id)
        service_report = stream_info.find('./serviceReport')
        if service_report.attrib.get('returnCode') != "200":
            raise ExtractorError(service_report.findtext('./description'), expected=True)
        subtitles = stream_info.findtext('./assetInfo/subtitlesFileUri')
        if subtitles:
            subtitles = {
                'en': [{
                    'ext': 'sami',
                    'url': compat_urlparse.urljoin(asset_url, subtitles),
                }]
            }
        formats = self._extract_f4m_formats(stream_info.findtext('./assetInfo/uriData/streamUri'), video_id)
        return {
            'id': video_id,
            '_programme_title': stream_info.findtext('./assetInfo/brandTitle'),
            'title': stream_info.findtext('./assetInfo/episodeTitle'),
            'upload_date': unified_strdate(video_data['txDate'] + ' ' + video_data.get('txTime', '')),
            'description': video_data.get('synopsis'),
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
            'duration': video_data["assetDuration"] * 60,
            '_drm_token': stream_info.findtext('./assetInfo/uriData/token'),
            '_programme_series': video_data.get("seriesNumber"),
            '_programme_episode': video_data.get("episodeNumber"),
        }