From d4f0908217d51124c1b9cfb3819348bf76d0f5f3 Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 23 Jul 2015 01:36:59 -0500 Subject: [PATCH 1/2] Add support for cbc.ca --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cbc.py | 97 ++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/cbc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..e11ce60f0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -60,6 +60,7 @@ from .brightcove import BrightcoveIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE +from .cbc import CBCIE from .camdemy import ( CamdemyIE, CamdemyFolderIE diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py new file mode 100644 index 000000000..73c0bbc7c --- /dev/null +++ b/youtube_dl/extractor/cbc.py @@ -0,0 +1,97 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + url_basename, + unescapeHTML, + js_to_json, + ExtractorError, +) +import re + + +class CBCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbc.ca/[^/]+/' + + _TESTS = [ + { + 'url': 'http://www.cbc.ca/news/thenational/the-real-cost-of-the-world-s-most-expensive-drug-1.3126338', + 'info_dict': { + 'id': 'if3k_n58u3hDrVX9dOXSTbtHBnSZGQpe', + 'ext': 'flv', + 'title': 'The real cost of the world\'s most expensive drug', + 'description': 'md5:407fb27bb8b10c2e1447bbad0c27e551', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://www.cbc.ca/player/News/ID/2672225049/', + 'info_dict': { + 'id': 'VfTVl5c2pr40a9jxAMWGIRZO8Mz4ubPZ', + 'ext': 'flv', + 'title': 'WATCH: New Earth from space image released by NASA', + 'description': 'md5:3ddd36b5d1066a067a0b0c8891a72506', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://www.cbc.ca/natureofthings/episodes/stonehenge-uncovered', + 'info_dict': { + 'id': 'QPnDq_piKkN5x0dH7SQF85cyJb_KOsG0', + 'ext': 'flv', + 'title': 'Stonehenge Uncovered', + }, + 'add_ie': ['ThePlatform'], + 'skip': 'Canada only', + } + ] + + def _real_extract(self, url): + # from http://www.cbc.ca/i/caffeine/js/Caffeine.js + # TP_FEED_DOMAIN:"http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" + # MPX_ACCOUNT_PID:"h9dtGB" + tp_feed_domain = "http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" + mpx_account_id = "h9dtGB" + + name = url_basename(url) + + webpage = self._download_webpage(url, name) + title = unescapeHTML( + self._search_regex('\s*(.+?)\s*', webpage, 'title')) + + cbcapp = re.findall( + r'CBC.APP.Caffeine.initInstance\((.+?)\);', webpage, re.DOTALL) + + clipids = [] + for jstr in cbcapp: + vdata = self._parse_json( + jstr, 'javascript chunk', transform_source=js_to_json) + if 'clipId' in vdata: + if vdata['clipId'] not in clipids: + clipids.append(vdata['clipId']) + + vids = [] + for cid in clipids: + feedurl = tp_feed_domain + \ + 'range=1-1&byContent=byReleases%3DbyId%253D' + cid + feedpage = self._download_webpage(feedurl, 'feed for clip ' + cid) + cjson = self._parse_json( + feedpage, 'clip feed json', transform_source=js_to_json) + for ent in cjson.get('entries', []): + for content in ent.get('content', []): + # assuming multi-content is playlist or multi-part video + vid = {} + for release in content.get('releases', []): + if 'url' in vid: + self.report_warning( + cid + ': multi-release video? Skipping, if content is missing please file a bug report') + continue + vid['url'] = 'http://link.theplatform.com/s/' + \ + mpx_account_id + '/' + release['pid'] + if 'url' in vid: + vids.append(self.url_result(vid['url'])) + if not vids: + raise ExtractorError('No video found', expected=True) + if len(vids) > 1: + return self.playlist_result(vids, name, title) + return vids[0] From ab84ca19f2e5180f467de71ea189eae5e0a9cc7b Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 25 Jul 2015 20:02:59 -0500 Subject: [PATCH 2/2] Title extract adjustment --- youtube_dl/extractor/cbc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 73c0bbc7c..5420d445f 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -56,8 +56,7 @@ class CBCIE(InfoExtractor): name = url_basename(url) webpage = self._download_webpage(url, name) - title = unescapeHTML( - self._search_regex('\s*(.+?)\s*', webpage, 'title')) + title = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') cbcapp = re.findall( r'CBC.APP.Caffeine.initInstance\((.+?)\);', webpage, re.DOTALL)