[wdr]: Add extractor for "Sendung mit dem Elefanten"

The homepage of "Sendung mit dem Elefanten" (a children's show) at http://www.wdrmaus.de/elefantenseite/ offers various videos. All videos use the same URL, but with different fragments, such as http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015 . The new extractor WDRElefantIE supports these URLs; it downloads the site's internal table of contents (a JSON document) to look up the video.
2017-10-25 15:00:31 +02:00 · 2017-10-25 15:00:31 +02:00 · 6bbc101aed
commit 6bbc101aed
parent 86f237e649
2 changed files with 54 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1275,6 +1275,7 @@ from .watchbox import WatchBoxIE
 from .watchindianporn import WatchIndianPornIE
 from .wdr import (
    WDRIE,
+    WDRElefantIE,
    WDRMobileIE,
 )
 from .webcaster import (
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@ -241,6 +241,59 @@ class WDRIE(WDRBaseIE):
        return info_dict


+class WDRElefantIE(WDRBaseIE):
+    """Extractor for "Sendung mit dem Elefanten" videos, which the site selects by URL fragment."""
+    _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/elefantenseite/#(?P<display_id>.+)'
+    IE_NAME = 'wdr:elefant'
+
+    _TESTS = [
+        {
+            'url': 'http://www.wdrmaus.de/elefantenseite/#lieder_geburtstagslied',
+            'info_dict': {
+                'title': 'Ich bin schon 1-2-3',
+                'id': 'mdb-1008774',
+                'ext': 'mp4',
+                'age_limit': None,
+                'upload_date': '20091119'
+            },
+        },
+        {
+            'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
+            'info_dict': {
+                'title': 'Folge Oster-Spezial 2015',
+                'id': 'mdb-1088195',
+                'ext': 'mp4',
+                'age_limit': None,
+                'upload_date': '20150406'
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Look up the URL fragment in the site's table of contents and extract the video it names."""
+        display_id = re.match(self._VALID_URL, url).group('display_id')
+        # Table of Contents seems to always be at this address, so fetch it directly.
+        # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5.
+        table_of_contents = self._download_json(
+            'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5', display_id)
+        if display_id not in table_of_contents:
+            raise ExtractorError(
+                'No entry in site\'s table of contents for this URL. '
+                'Is the fragment part of the URL (after the #) correct?',
+                expected=True)
+        xml_metadata_path = table_of_contents[display_id]['xmlPath']
+        xml_metadata = self._download_xml(
+            'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path, display_id)
+        zmdb_url_element = xml_metadata.find('./movie/zmdb_url')
+        # A missing or empty zmdb_url leaves no video URL to pass to _extract_wdr_video.
+        if zmdb_url_element is None or not zmdb_url_element.text:
+            raise ExtractorError(
+                'The URL looks valid, but no video was found. Note that download only works '
+                'on pages showing a single video, not on video selection pages.',
+                expected=True)
+        return self._extract_wdr_video(zmdb_url_element.text, display_id)
+
+
 class WDRMobileIE(InfoExtractor):
    _VALID_URL = r'''(?x)
        https?://mobile-ondemand\.wdr\.de/