From ecfc2715541b8536d8c5305be5e9ade657db3561 Mon Sep 17 00:00:00 2001
From: gheoan <gheoan@outlook.com>
Date: Sat, 18 May 2019 00:51:39 +0300
Subject: [PATCH] [harvarddce] Add new extractor

Allows downloading some of the videos from the [Harvard Open Learning Initiative][1]. These videos appear to be licensed under Creative Commons.

[1]: https://www.extension.harvard.edu/open-learning-initiative
---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/harvarddce.py | 59 ++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 youtube_dl/extractor/harvarddce.py
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 3037b5a45..d754f8386 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -442,6 +442,7 @@ from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
 from .groupon import GrouponIE
 from .hark import HarkIE
+from .harvarddce import HarvardDceIE
 from .hbo import HBOIE
 from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
diff --git a/youtube_dl/extractor/harvarddce.py b/youtube_dl/extractor/harvarddce.py
new file mode 100644
index 000000000..39b47d666
--- /dev/null
+++ b/youtube_dl/extractor/harvarddce.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (int_or_none, try_get)
+
+
+class HarvardDceIE(InfoExtractor):
+    """Extract lecture recordings from Harvard DCE's Matterhorn player."""
+    _VALID_URL = r'https?://matterhorn\.dce\.harvard\.edu/engage/player/watch\.html\?id=(?P<id>[0-9a-z-]+)'
+    _TEST = {
+        'url': 'https://matterhorn.dce.harvard.edu/engage/player/watch.html?id=1a5e78df-fbb5-4c97-be82-860fb69b4379',
+        'info_dict': {
+            'id': '1a5e78df-fbb5-4c97-be82-860fb69b4379',
+            'title': 'Lecture 4',
+            'ext': 'mp4',
+            'description': 'Review, Kernels, Normality',
+            'duration': float(3187),
+        }
+    }
+
+    def _real_extract(self, url):
+        """Query the episode search endpoint and build the info dict."""
+        video_id = self._match_id(url)
+        response = self._download_json(
+            'https://matterhorn.dce.harvard.edu/search/episode.json', video_id, query={'id': video_id})
+        result = response['search-results']['result']
+        mediapackage = result['mediapackage']
+        tracks = mediapackage['media']['track']
+        if isinstance(tracks, dict):
+            # NOTE(review): a lone track may arrive as an object, not a list - confirm
+            tracks = [tracks]
+
+        def video_bitrate(track):
+            return try_get(track, lambda x: x['video']['bitrate']) or 0
+
+        # Build a real list (map() is lazy on Python 3), lowest video bitrate first
+        formats = []
+        for track in sorted(tracks, key=video_bitrate):
+            audio = track.get('audio') or {}
+            video = track.get('video') or {}
+            formats.append({
+                'url': track['url'],
+                'acodec': try_get(audio, lambda x: x['encoder']['type']),
+                'abr': audio.get('bitrate'),
+                'vcodec': try_get(video, lambda x: x['encoder']['type']),
+                'vbr': video.get('bitrate'),
+                'fps': video.get('framerate'),
+                'resolution': video.get('resolution'),
+            })
+
+        return {
+            'id': video_id,
+            'title': result.get('dcTitle'),
+            'formats': formats,
+            'description': result.get('dcDescription'),
+            'duration': int_or_none(mediapackage.get('duration'), scale=1000),
+            'license': result.get('dcLicense'),
+        }