[MallTv] Add new extractor

2018-10-07 21:33:26 +02:00 · 2018-10-07 21:33:26 +02:00 · 95b8a52327
commit 95b8a52327
parent 5d90a8a5f3
2 changed files with 76 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -595,6 +595,7 @@ from .mailru import (
    MailRuMusicSearchIE,
 )
 from .makertv import MakerTVIE
+from .malltv import MallTvIE
 from .mangomolo import (
    MangomoloVideoIE,
    MangomoloLiveIE,
--- a/youtube_dl/extractor/malltv.py
+++ b/youtube_dl/extractor/malltv.py
@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    NO_DEFAULT,
+)
+
+
+class MallTvIE(InfoExtractor):
+    _VALID_URL = r'https://mall.tv/(?P<id>[^/#?]+)'
+    _TEST = {
+        'url': 'https://mall.tv/tajemstvi-nejkrupavejsich-kurecich-kridylek',
+        'info_dict': {
+            'id': 'tajemstvi-nejkrupavejsich-kurecich-kridylek',
+            'ext': 'mp4',
+            'title': 'Tajemství nejkřupavějších kuřecích křidýlek',
+            'description': 'md5:f77cbb85d08745bfc85a2768fa34b57d',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 58.0,
+            'upload_date': '20180912',
+            'timestamp': 1536781320,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    # MAll.tv has malformed type atribute (i.e. missing quotes)
+    #
+    JSON_LD_RE_MALLTV_MALFORMED = r'(?is)<script[^>]+type=application/ld\+json[^>]*>(?P<json_ld>.+?)</script>'
+
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
+        json_ld = self._search_regex(
+            self.JSON_LD_RE_MALLTV_MALFORMED, html, 'JSON-LD', group='json_ld', **kwargs)
+        default = kwargs.get('default', NO_DEFAULT)
+        if not json_ld:
+            return default if default is not NO_DEFAULT else {}
+        # JSON-LD may be malformed and thus `fatal` should be respected.
+        # At the same time `default` may be passed that assumes `fatal=False`
+        # for _search_regex. Let's simulate the same behavior here as well.
+        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
+        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage, default=None)
+        description = self._og_search_description(webpage, default=None)
+
+        ldjson = self._search_json_ld(webpage, video_id, default=None)
+
+        # Again, the malform attribute
+        #
+        source = self._search_regex(re.compile(r'<source\s+src=([^ \t]+)'), webpage, None, default=None)
+
+        format_url = source + '.m3u8'
+        formats = self._extract_m3u8_formats(format_url, video_id)
+        for format in formats:
+            format['ext'] = 'mp4'
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': ldjson['duration'],
+            'timestamp': ldjson['timestamp'],
+            'thumbnail': ldjson['thumbnail'],
+            'formats': formats
+        }