diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..e11ce60f0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -60,6 +60,7 @@ from .brightcove import BrightcoveIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE +from .cbc import CBCIE from .camdemy import ( CamdemyIE, CamdemyFolderIE diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py new file mode 100644 index 000000000..73c0bbc7c --- /dev/null +++ b/youtube_dl/extractor/cbc.py @@ -0,0 +1,97 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + url_basename, + unescapeHTML, + js_to_json, + ExtractorError, +) +import re + + +class CBCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbc.ca/[^/]+/' + + _TESTS = [ + { + 'url': 'http://www.cbc.ca/news/thenational/the-real-cost-of-the-world-s-most-expensive-drug-1.3126338', + 'info_dict': { + 'id': 'if3k_n58u3hDrVX9dOXSTbtHBnSZGQpe', + 'ext': 'flv', + 'title': 'The real cost of the world\'s most expensive drug', + 'description': 'md5:407fb27bb8b10c2e1447bbad0c27e551', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://www.cbc.ca/player/News/ID/2672225049/', + 'info_dict': { + 'id': 'VfTVl5c2pr40a9jxAMWGIRZO8Mz4ubPZ', + 'ext': 'flv', + 'title': 'WATCH: New Earth from space image released by NASA', + 'description': 'md5:3ddd36b5d1066a067a0b0c8891a72506', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://www.cbc.ca/natureofthings/episodes/stonehenge-uncovered', + 'info_dict': { + 'id': 'QPnDq_piKkN5x0dH7SQF85cyJb_KOsG0', + 'ext': 'flv', + 'title': 'Stonehenge Uncovered', + }, + 'add_ie': ['ThePlatform'], + 'skip': 'Canada only', + } + ] + + def _real_extract(self, url): + # from http://www.cbc.ca/i/caffeine/js/Caffeine.js + # TP_FEED_DOMAIN:"http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" + # MPX_ACCOUNT_PID:"h9dtGB" + tp_feed_domain = "http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" + mpx_account_id = "h9dtGB" + + name = url_basename(url) + + webpage = self._download_webpage(url, name) + title = unescapeHTML( + self._search_regex('