# -*- coding: utf-8 -*-
# Provenance: git patch 7c6bc92ae0befba8121627fb6aac7c7fb45641de
#   From: James Oakley
#   Date: Wed, 11 Nov 2015 20:16:53 -0800
#   Subject: [PATCH] [cbc] Add extractor
# The patch creates youtube_dl/extractor/cbc.py (this module) and registers
# it in youtube_dl/extractor/__init__.py by adding ``from .cbc import CBCIE``
# between the ``canalc2`` and ``cbs`` imports.
from __future__ import unicode_literals

from .common import InfoExtractor


CBC_CAFFEINE_MODULES_URL = 'http://www.cbc.ca/i/caffeine/js/Caffeine.modules.js'


class CBCIE(InfoExtractor):
    """Extractor for cbc.ca episode pages (``.../episodes/season-N/...``).

    The episode page only yields a clip id.  The feed endpoints and the MPX
    account id are scraped from the site's Caffeine modules script, two feed
    lookups resolve the clip to a media entry, and the actual download is
    delegated to a player.theplatform.com embed URL via ``url_transparent``.
    """
    IE_DESC = 'cbc.ca'
    # The named group must be called ``id``: ``_match_id`` reads it, and the
    # test below expects the ``season-N/...`` tail of the URL as the video id.
    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/.*/episodes/(?P<id>season-\d+/.+)'

    _TESTS = [{
        'url': 'http://www.cbc.ca/22minutes/episodes/season-23/episode-197',
        'md5': '9108d19314a116778932b874caf9bc91',
        'info_dict': {
            'id': 'season-23/episode-197',
            'ext': 'mp4',
            'title': '22 Minutes - S23E01 - Episode 1',
            'description': 'md5:03e943f67d535a48522b5bb4ba7cf812',
            'thumbnail': r're:http://.*\.jpg',
            'duration': 1315,
            'timestamp': 1444177800,
            'upload_date': '20151007',
        },
    }]

    def _real_extract(self, url):
        """Resolve a cbc.ca episode URL to a ``url_transparent`` result.

        Returns a dict carrying the id, title, description, timestamp and
        thumbnails read from the CBC feed, with ``url`` pointing at the
        theplatform embed page for the episode's ``pid``.
        """
        video_id = self._match_id(url)
        episode_page = self._download_webpage(url, video_id)

        # The page initialises the player with a numeric clip id.
        clip_id = self._search_regex(
            r"CBC\.APP\.Caffeine\.initInstance\(\{'clipId':\s*'(\d+)'",
            episode_page,
            'Clip ID'
        )

        # Feed URLs and the account id are not on the episode page; they are
        # scraped from the shared player script instead.
        caffeine_js = self._download_webpage(CBC_CAFFEINE_MODULES_URL, video_id)

        caffeine_content_url = self._search_regex(
            r'(http://tpfeed\.cbc\.ca/[a-zA-Z0-9/]+\?byContent=byReleases%3DbyId%253D)',
            caffeine_js,
            'Caffeine content URL'
        )

        caffeine_media_url = self._search_regex(
            r'(http://tpfeed\.cbc\.ca/[a-zA-Z0-9/_]+\?)"',
            caffeine_js,
            'Caffeine media URL'
        )

        mpx_account_id = self._search_regex(
            r'MPX_ACCOUNT_PID:"([a-zA-Z0-9/]+)"',
            caffeine_js,
            'MPX account ID'
        )

        # First lookup: clip id -> release; the media id is the last path
        # component of the release's ``mediaId`` value.
        caffeine_content = self._download_json(
            caffeine_content_url + clip_id + '&fields=content',
            video_id
        )
        media_id = caffeine_content['entries'][0]['content'][0]['releases'][0]['mediaId'].split('/')[-1]

        # Second lookup: media id (used as GUID) -> full episode metadata.
        caffeine_media = self._download_json(
            caffeine_media_url + 'q=*&byGuid=' + media_id,
            video_id
        )

        episode_info = caffeine_media['entries'][0]

        # Thumbnails and their dimensions are optional metadata: a missing
        # field should not abort the extraction.
        thumbnails = [{
            'url': thumbnail['url'],
            'width': thumbnail.get('width'),
            'height': thumbnail.get('height'),
        } for thumbnail in (episode_info.get('thumbnails') or [])
            if thumbnail.get('url')]

        # pubDate is scaled down by 1000 (presumably milliseconds since the
        # epoch -- the test expects a seconds timestamp).  Floor division
        # keeps the result an integer on Python 3 as well as Python 2.
        pub_date = episode_info.get('pubDate')
        timestamp = pub_date // 1000 if pub_date is not None else None

        title = '{show} - S{season}E{episode} - {title}'.format(
            show=episode_info['cbc$show'],
            season=episode_info['cbc$seasonNumber'],
            episode=episode_info['cbc$episodeNumber'],
            title=episode_info['title']
        )

        theplatform_url = 'http://player.theplatform.com/p/%s/default_prod_vms/embed/select/media/%s' % (
            mpx_account_id, episode_info['pid'])

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
            'description': episode_info.get('description'),
            'timestamp': timestamp,
            'url': theplatform_url,
            'thumbnails': thumbnails,
        }