Added rtl.lu extractor
This commit is contained in:
parent
910c552052
commit
62962240b1
@ -346,6 +346,7 @@ from .rottentomatoes import RottenTomatoesIE
|
|||||||
from .roxwel import RoxwelIE
|
from .roxwel import RoxwelIE
|
||||||
from .rtbf import RTBFIE
|
from .rtbf import RTBFIE
|
||||||
from .rte import RteIE
|
from .rte import RteIE
|
||||||
|
from .rtllu import RtlluIE
|
||||||
from .rtlnl import RtlXlIE
|
from .rtlnl import RtlXlIE
|
||||||
from .rtlnow import RTLnowIE
|
from .rtlnow import RTLnowIE
|
||||||
from .rtp import RTPIE
|
from .rtp import RTPIE
|
||||||
|
108
youtube_dl/extractor/rtllu.py
Normal file
108
youtube_dl/extractor/rtllu.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
|
||||||
|
|
||||||
|
class RtlluIE(InfoExtractor):
    """Extractor for video and audio pages on the rtl.lu family of sites.

    The page embeds an inline ``<script language="Javascript">`` block.  For
    videos that script assigns ``objectN.sources`` (a JSON document keyed by
    quality), ``.title``, ``.videoid`` and ``.publicdate``; for audio it calls
    ``play_mp3("objectN", "<url>", ...)`` instead.
    """

    IE_NAME = 'rtl.lu'

    _VALID_URL = r'https?://(www|tele|radio|5minutes)\.rtl\.lu\/.*?\/(?P<id>[0-9]+)'

    _TEST = {
        'url': 'http://radio.rtl.lu/emissiounen/background/599319.html',
        # TODO: replace the placeholder with the real checksum of the first
        # bytes of the download (run the test suite to obtain it).
        'md5': 'TODO:',
        'info_dict': {
            'id': '599319',
            'ext': 'mp4',
        },
    }

    # (key in the page's ``sources`` JSON, format_id, display name), ordered
    # from worst to best quality as expected for the ``formats`` list.
    _QUALITIES = (
        ('httplq', 'lq', 'Low Quality'),
        ('http', 'sd', 'Standard Quality'),
        ('httphq', 'hq', 'High Quality'),
    )

    def _search_js_property(self, javascript, name):
        """Return the quoted value assigned to ``object*.<name>``, or None."""
        return self._search_regex(
            r'object.*\.%s = \'(?P<value>.*?)\';' % name,
            javascript, name, default=None, group='value')

    def _real_extract(self, url):
        """Build the info dict for *url*.

        Returns a multi-format video result when the page script defines
        ``sources``; otherwise falls back to the single-URL audio result.
        Raises the extractor framework's regex error when neither a
        ``sources`` assignment nor a ``play_mp3`` call can be found.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Equivalent to the former ``((\n*?.*?)*?)`` pattern but without the
        # nested lazy quantifiers; DOTALL lets ``.`` span line breaks.
        javascript = self._html_search_regex(
            r'<script language="Javascript">(.*?)</script>',
            webpage, 'javascript', flags=re.DOTALL)

        sources_json = self._search_js_property(javascript, 'sources')
        if sources_json is None:
            # No video sources on the page: treat it as an audio item.
            return {
                'id': video_id,
                'title': self.get_audio_title(webpage),
                'url': self._search_regex(
                    r'play_mp3\("object[0-9]*", "(?P<value>.*?)",',
                    javascript, 'mp3 url', group='value'),
            }

        sources = json.loads(sources_json)

        # Only advertise the qualities the page actually provides instead of
        # failing with KeyError when one of them is missing.
        formats = []
        for source_key, format_id, format_name in self._QUALITIES:
            source_url = (sources.get(source_key) or {}).get('src')
            if not source_url:
                continue
            formats.append({
                'url': source_url,
                'format': format_name,
                'format_id': format_id,
                'protocol': 'http',
            })

        return {
            'id': self._search_js_property(javascript, 'videoid') or video_id,
            'title': self.get_video_title(webpage, javascript),
            'formats': formats,
            # The previous code filled this field from the ``.title``
            # property, i.e. with the title text rather than an image URL.
            # NOTE(review): if the script exposes a dedicated thumbnail
            # property, prefer it here - confirm against a live page.
            'thumbnail': self._og_search_thumbnail(webpage),
            # NOTE(review): passed through unchanged; confirm the site emits
            # the YYYYMMDD form expected for ``upload_date``.
            'upload_date': self._search_js_property(javascript, 'publicdate'),
        }

    def get_video_title(self, webpage, javascript):
        """Return the video title.

        Preference order: the script's ``.title`` property, the last
        ``<h1>`` that precedes a ``<p class="sub">`` in the page, then the
        OpenGraph title.
        """
        javascript_title = self._search_js_property(javascript, 'title')
        if javascript_title:
            return javascript_title

        html_titles = re.findall(
            r'</div>.*<h1>(?P<title>.*?)</h1>.*?<p class="sub">',
            webpage, flags=re.S)
        if html_titles and html_titles[-1]:
            return html_titles[-1]

        return self._og_search_title(webpage)

    def get_audio_title(self, webpage):
        """Return the audio title.

        Preference order: "<span> - <title>" from the header ``<h1>`` (using
        whichever of the two parts is present), the first ``<h5>``, then the
        OpenGraph title.
        """
        title_regex = r'<header><h1><span>(?P<span>.*?)</span>(?P<title>.*?)</h1>'
        span = self._html_search_regex(
            title_regex, webpage, 'span', group='span', default=None)
        title = self._html_search_regex(
            title_regex, webpage, 'title', group='title', default=None)

        # Join only the parts that were found; joining None would raise
        # TypeError.
        header_title = ' - '.join(part for part in (span, title) if part)
        if header_title:
            return header_title

        h5_title = self._html_search_regex(
            r'<h5>(?P<title>.*?)</h5>', webpage, 'title',
            group='title', default=None)

        return h5_title or self._og_search_title(webpage)
|
Loading…
x
Reference in New Issue
Block a user