Added rtl.lu extractor

2015-01-20 11:02:01 +01:00 · 2015-01-20 11:02:01 +01:00 · 62962240b1
commit 62962240b1
parent 910c552052
2 changed files with 109 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -346,6 +346,7 @@ from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
+from .rtllu import RtlluIE
 from .rtlnl import RtlXlIE
 from .rtlnow import RTLnowIE
 from .rtp import RTPIE
--- a/youtube_dl/extractor/rtllu.py
+++ b/youtube_dl/extractor/rtllu.py
@@ -0,0 +1,108 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class RtlluIE(InfoExtractor):
+    IE_NAME = 'rtl.lu'
+
+    _VALID_URL = r'https?://(www|tele|radio|5minutes)\.rtl\.lu\/.*?\/(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://radio.rtl.lu/emissiounen/background/599319.html',
+        'md5': 'TODO:',
+        'info_dict': {
+            'id': '599319',
+            'ext': 'mp4',
+        },
+    }
+
+    def _real_extract(self, url):
+        match = self._VALID_URL_RE.match(url)
+        id = match.group('id')
+
+        webpage = self._download_webpage(url, id)
+
+        javascript_regex = r'<script language="Javascript">((\n*?.*?)*?)</script>'
+        javascript = self._html_search_regex(javascript_regex, webpage, 'javascript')
+
+        try:
+            javascript_sources_regex = r'object.*\.sources = \'(?P<value>.*?)\';'
+            sources = json.loads(re.search(javascript_sources_regex, javascript).group('value'))
+
+            javascript_thumbnail_regex = r'object.*\.title = \'(?P<value>.*?)\';'
+            javascript_thumbnail = re.search(javascript_thumbnail_regex, javascript).group('value')
+
+            javascript_videoid_regex = r'object.*\.videoid = \'(?P<value>.*?)\';'
+            javascript_videoid = re.search(javascript_videoid_regex, javascript).group('value')
+
+            javascript_publicdate_regex = r'object.*\.publicdate = \'(?P<value>.*?)\';'
+            javascript_publicdate = re.search(javascript_publicdate_regex, javascript).group('value')
+
+            formats = [
+                {
+                    'url': sources['httplq']['src'],
+                    'format': 'Low Quality',
+                    'format_id': 'lq',
+                    'protocol': 'http',
+                },
+                {
+                    'url': sources['http']['src'],
+                    'format': 'Standard Quality',
+                    'format_id': 'sd',
+                    'protocol': 'http',
+                },
+                {
+                    'url': sources['httphq']['src'],
+                    'format': 'High Quality',
+                    'format_id': 'hq',
+                    'protocol': 'http',
+                },
+            ]
+
+            return {
+                'id': javascript_videoid or id,
+                'title': self.get_video_title(webpage, javascript),
+                'formats': formats,
+                'thumbnail': javascript_thumbnail,
+                'upload_date': javascript_publicdate,
+            }
+        except AttributeError:
+            javascript_mp3_regex = r'play_mp3\("object[0-9]*", "(?P<value>.*?)",'
+            javascript_mp3 = re.search(javascript_mp3_regex, javascript).group('value')
+            print(javascript_mp3)
+
+            return {
+                'id': id,
+                'title': self.get_audio_title(webpage),
+                'url': javascript_mp3,
+            }
+
+    def get_video_title(self, webpage, javascript):
+
+        title_regex = r'</div>.*<h1>(?P<title>.*?)</h1>.*?<p class="sub">'
+        title = re.findall(title_regex, webpage, flags=re.S)
+
+        if title:
+            title = title[-1]
+
+        javascript_title_regex = r'object.*\.title = \'(?P<value>.*?)\';'
+        javascript_title = re.search(javascript_title_regex, javascript).group('value')
+        return javascript_title or title or self._og_search_title(webpage)
+
+    def get_audio_title(self, webpage):
+
+        title_regex = r'<header><h1><span>(?P<span>.*?)</span>(?P<title>.*?)</h1>'
+        title = self._html_search_regex(title_regex, webpage, 'title', group='title', fatal=False)
+        span = self._html_search_regex(title_regex, webpage, 'span', group='span', fatal=False)
+
+        if title or span:
+            title = ' - '.join([span, title])
+
+        h5_title_regex = r'<h5>(?P<title>.*?)</h5>'
+        h5_title = self._html_search_regex(h5_title_regex, webpage, 'title', group='title', fatal=False)
+
+        return title or h5_title or self._og_search_title(webpage)