[harvarddce] Add new extractor

Allows downloading some of the videos from the [Harvard Open Learning Initiative][1]. These seem to be licensed under Creative Commons.

[1]: https://www.extension.harvard.edu/open-learning-initiative
This commit is contained in:
gheoan 2019-05-18 00:51:39 +03:00
parent 82e91d20a0
commit ecfc271554
2 changed files with 60 additions and 0 deletions

View File

@ -442,6 +442,7 @@ from .goshgay import GoshgayIE
from .gputechconf import GPUTechConfIE
from .groupon import GrouponIE
from .hark import HarkIE
from .harvarddce import HarvardDceIE
from .hbo import HBOIE
from .hearthisat import HearThisAtIE
from .heise import HeiseIE

View File

@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (int_or_none, try_get)
class HarvardDceIE(InfoExtractor):
    """Extract lecture videos from the Harvard DCE Matterhorn player.

    The video id is taken from the ``id`` query parameter of the player URL
    and used to query the ``episode.json`` search endpoint, whose response
    supplies both the metadata and the media tracks.
    """

    # Literal dots are escaped so that only the real ``watch.html`` page matches.
    _VALID_URL = r'https?://matterhorn\.dce\.harvard\.edu/engage/player/watch\.html\?id=(?P<id>[0-9a-z-]+)'
    _TEST = {
        'url': 'https://matterhorn.dce.harvard.edu/engage/player/watch.html?id=1a5e78df-fbb5-4c97-be82-860fb69b4379',
        'info_dict': {
            'id': '1a5e78df-fbb5-4c97-be82-860fb69b4379',
            'title': 'Lecture 4',
            'ext': 'mp4',
            'description': 'Review, Kernels, Normality',
            'duration': float(3187),
        }
    }

    @staticmethod
    def _video_bitrate(track):
        """Return the video bitrate of *track*, or 0 when it has none."""
        return try_get(track, lambda x: x['video']['bitrate']) or 0

    @staticmethod
    def _track_to_format(track):
        """Build a format dict from a single media track entry.

        Only ``url`` is required; a track without ``audio`` or ``video``
        sections simply yields ``None`` for the corresponding fields.
        """
        audio = track.get('audio') or {}
        video = track.get('video') or {}
        return {
            'url': track['url'],
            'acodec': try_get(audio, lambda x: x['encoder']['type']),
            'abr': audio.get('bitrate'),
            'vcodec': try_get(video, lambda x: x['encoder']['type']),
            'vbr': video.get('bitrate'),
            'fps': video.get('framerate'),
            'resolution': video.get('resolution'),
        }

    def _real_extract(self, url):
        """Return the info dict for the episode referenced by *url*.

        Raises ``KeyError`` if the JSON response lacks the
        ``search-results``/``result``/``mediapackage``/``media``/``track``
        path, or if a track has no ``url``.
        """
        video_id = self._match_id(url)
        response = self._download_json(
            'https://matterhorn.dce.harvard.edu/search/episode.json',
            video_id, query={'id': video_id})
        result = response['search-results']['result']
        mediapackage = result['mediapackage']

        tracks = mediapackage['media']['track']
        # NOTE(review): presumably an episode with a single track may be
        # serialized as a bare object instead of a list - confirm against the
        # API. Wrapping it keeps the code below from failing in that case.
        if isinstance(tracks, dict):
            tracks = [tracks]

        # Lowest video bitrate first. A real list is built on purpose: on
        # Python 3 ``map`` returns a one-shot iterator, which cannot be
        # indexed or iterated more than once by the code consuming 'formats'.
        formats = [
            self._track_to_format(track)
            for track in sorted(tracks, key=self._video_bitrate)]

        # The raw duration is divided by 1000 before being reported.
        duration = int_or_none(mediapackage.get('duration'), scale=1000)

        return {
            'id': video_id,
            'title': result.get('dcTitle'),
            'formats': formats,
            'description': result.get('dcDescription'),
            'duration': duration,
            'license': result.get('dcLicense'),
        }