Add support for cbc.ca

This commit is contained in:
fnord 2015-07-23 01:36:59 -05:00
parent b37317d8b0
commit d4f0908217
2 changed files with 98 additions and 0 deletions

View File

@ -60,6 +60,7 @@ from .brightcove import BrightcoveIE
from .buzzfeed import BuzzFeedIE from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE from .byutv import BYUtvIE
from .c56 import C56IE from .c56 import C56IE
from .cbc import CBCIE
from .camdemy import ( from .camdemy import (
CamdemyIE, CamdemyIE,
CamdemyFolderIE CamdemyFolderIE

View File

@ -0,0 +1,97 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
url_basename,
unescapeHTML,
js_to_json,
ExtractorError,
)
import re
class CBCIE(InfoExtractor):
    """Extractor for pages on cbc.ca that embed the "Caffeine" video player.

    The page is scanned for ``CBC.APP.Caffeine.initInstance(...)`` calls; each
    call's argument may carry a ``clipId``.  Every distinct clip id is looked
    up in the CBC thePlatform feed and each resulting release is delegated to
    the ThePlatform extractor through ``url_result``.
    """

    # NOTE(review): this pattern accepts any cbc.ca URL that has at least one
    # path segment followed by a slash, so pages without video are accepted
    # too and end in the "No video found" error below.
    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/[^/]+/'
    _TESTS = [
        {
            'url': 'http://www.cbc.ca/news/thenational/the-real-cost-of-the-world-s-most-expensive-drug-1.3126338',
            'info_dict': {
                'id': 'if3k_n58u3hDrVX9dOXSTbtHBnSZGQpe',
                'ext': 'flv',
                'title': 'The real cost of the world\'s most expensive drug',
                'description': 'md5:407fb27bb8b10c2e1447bbad0c27e551',
            },
            'add_ie': ['ThePlatform'],
        },
        {
            'url': 'http://www.cbc.ca/player/News/ID/2672225049/',
            'info_dict': {
                'id': 'VfTVl5c2pr40a9jxAMWGIRZO8Mz4ubPZ',
                'ext': 'flv',
                'title': 'WATCH: New Earth from space image released by NASA',
                'description': 'md5:3ddd36b5d1066a067a0b0c8891a72506',
            },
            'add_ie': ['ThePlatform'],
        },
        {
            'url': 'http://www.cbc.ca/natureofthings/episodes/stonehenge-uncovered',
            'info_dict': {
                'id': 'QPnDq_piKkN5x0dH7SQF85cyJb_KOsG0',
                'ext': 'flv',
                'title': 'Stonehenge Uncovered',
            },
            'add_ie': ['ThePlatform'],
            'skip': 'Canada only',
        },
    ]

    # Values taken from http://www.cbc.ca/i/caffeine/js/Caffeine.js
    #   TP_FEED_DOMAIN:"http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?"
    #   MPX_ACCOUNT_PID:"h9dtGB"
    _TP_FEED_DOMAIN = 'http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?'
    _MPX_ACCOUNT_ID = 'h9dtGB'

    def _extract_clip_ids(self, webpage):
        """Return the distinct clip ids found in *webpage*, in page order."""
        clip_ids = []
        # Dots are escaped so only the literal JavaScript call is matched.
        for js_args in re.findall(
                r'CBC\.APP\.Caffeine\.initInstance\((.+?)\);',
                webpage, re.DOTALL):
            player_config = self._parse_json(
                js_args, 'javascript chunk', transform_source=js_to_json)
            # The argument is expected to be an object; ignore anything else.
            if not isinstance(player_config, dict):
                continue
            clip_id = player_config.get('clipId')
            if clip_id is None or clip_id == '':
                continue
            # The id may be written as a bare number in the page source;
            # normalise to text so it can be concatenated into URLs/messages.
            clip_id = '%s' % clip_id
            if clip_id not in clip_ids:
                clip_ids.append(clip_id)
        return clip_ids

    def _extract_release_urls(self, clip_id):
        """Return thePlatform link URLs for every content item of *clip_id*.

        One URL is produced per ``content`` element of the feed; only the
        first usable release of each element is taken and a warning is
        reported for any additional release.
        """
        feed_url = (
            self._TP_FEED_DOMAIN
            + 'range=1-1&byContent=byReleases%3DbyId%253D' + clip_id)
        feed_page = self._download_webpage(
            feed_url, 'feed for clip ' + clip_id)
        feed = self._parse_json(
            feed_page, 'clip feed json', transform_source=js_to_json)

        urls = []
        for entry in feed.get('entries') or []:
            for content in entry.get('content') or []:
                # assuming multi-content is playlist or multi-part video
                release_url = None
                for release in content.get('releases') or []:
                    if release_url is not None:
                        self.report_warning(
                            clip_id + ': multi-release video? Skipping, if content is missing please file a bug report')
                        continue
                    pid = release.get('pid')
                    if not pid:
                        # A release without a pid cannot be turned into a
                        # link; skip it instead of failing with KeyError.
                        continue
                    release_url = (
                        'http://link.theplatform.com/s/'
                        + self._MPX_ACCOUNT_ID + '/' + '%s' % pid)
                if release_url is not None:
                    urls.append(release_url)
        return urls

    def _real_extract(self, url):
        """Resolve a cbc.ca page to a ThePlatform result or a playlist.

        Returns a single ``url_result`` when exactly one video is found and a
        playlist (id: URL basename, title: page ``<title>``) when several are.

        Raises:
            ExtractorError: (expected) when the page yields no video.
        """
        name = url_basename(url)
        webpage = self._download_webpage(url, name)
        title = unescapeHTML(self._search_regex(
            r'<title>\s*(.+?)\s*</title>', webpage, 'title'))

        entries = []
        for clip_id in self._extract_clip_ids(webpage):
            for release_url in self._extract_release_urls(clip_id):
                entries.append(self.url_result(release_url))

        if not entries:
            raise ExtractorError('No video found', expected=True)
        if len(entries) > 1:
            return self.playlist_result(entries, name, title)
        return entries[0]