From ecfc2715541b8536d8c5305be5e9ade657db3561 Mon Sep 17 00:00:00 2001
From: gheoan
Date: Sat, 18 May 2019 00:51:39 +0300
Subject: [PATCH] [harvarddce] Add new extractor

Allows downloading some of the videos from [Harvard Open Learning
Initiative][1]. These seem to be licensed under Creative Commons.

[1]: https://www.extension.harvard.edu/open-learning-initiative
---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/harvarddce.py | 74 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 youtube_dl/extractor/harvarddce.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 3037b5a45..d754f8386 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -442,6 +442,7 @@ from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
 from .groupon import GrouponIE
 from .hark import HarkIE
+from .harvarddce import HarvardDceIE
 from .hbo import HBOIE
 from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
diff --git a/youtube_dl/extractor/harvarddce.py b/youtube_dl/extractor/harvarddce.py
new file mode 100644
index 000000000..39b47d666
--- /dev/null
+++ b/youtube_dl/extractor/harvarddce.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    try_get,
+)
+
+
+class HarvardDceIE(InfoExtractor):
+    """Extract lectures played by the Harvard DCE Matterhorn engage player."""
+
+    _VALID_URL = r'https?://matterhorn\.dce\.harvard\.edu/engage/player/watch\.html\?id=(?P<id>[0-9a-z-]+)'
+    _TEST = {
+        'url': 'https://matterhorn.dce.harvard.edu/engage/player/watch.html?id=1a5e78df-fbb5-4c97-be82-860fb69b4379',
+        'info_dict': {
+            'id': '1a5e78df-fbb5-4c97-be82-860fb69b4379',
+            'title': 'Lecture 4',
+            'ext': 'mp4',
+            'description': 'Review, Kernels, Normality',
+            'duration': 3187,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the episode whose id is in ``url``."""
+        video_id = self._match_id(url)
+
+        # Metadata and track URLs are read from the search endpoint, queried
+        # with the id taken from the player URL.
+        response = self._download_json(
+            'https://matterhorn.dce.harvard.edu/search/episode.json',
+            video_id, query={'id': video_id})
+        result = response['search-results']['result']
+        mediapackage = result['mediapackage']
+
+        tracks = try_get(mediapackage, lambda x: x['media']['track']) or []
+        # NOTE(review): presumably a lone track can arrive as a bare object
+        # rather than a one-element list - confirm against the API.
+        if isinstance(tracks, dict):
+            tracks = [tracks]
+
+        formats = []
+        for track in tracks:
+            track_url = track.get('url') if isinstance(track, dict) else None
+            if not track_url:
+                continue
+            audio = track.get('audio') or {}
+            video = track.get('video') or {}
+            # NOTE(review): bitrates are passed through unscaled - confirm unit.
+            formats.append({
+                'url': track_url,
+                'acodec': try_get(audio, lambda x: x['encoder']['type']),
+                'abr': float_or_none(audio.get('bitrate')),
+                'vcodec': try_get(video, lambda x: x['encoder']['type']),
+                'vbr': float_or_none(video.get('bitrate')),
+                'fps': float_or_none(video.get('framerate')),
+                'resolution': video.get('resolution'),
+            })
+        # formats is a concrete list because _sort_formats sorts it in place.
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            # Indexed rather than .get(): a title is mandatory for the result.
+            'title': result['dcTitle'],
+            'formats': formats,
+            'description': result.get('dcDescription'),
+            # scale=1000 divides the raw value (NOTE(review): presumably ms).
+            'duration': int_or_none(mediapackage.get('duration'), scale=1000),
+            'license': result.get('dcLicense'),
+        }