From 0817510dfba8a7de1b8e46f7755994510f82366e Mon Sep 17 00:00:00 2001 From: zmobbie Date: Fri, 12 Aug 2016 00:55:34 +0300 Subject: [PATCH] Kanal2 Add new extractor --- youtube_dl/extractor/kanal2.py | 97 +++++++++++++++------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/kanal2.py b/youtube_dl/extractor/kanal2.py index 97ce8b5ea..b42b3f7a2 100644 --- a/youtube_dl/extractor/kanal2.py +++ b/youtube_dl/extractor/kanal2.py @@ -11,13 +11,14 @@ from ..utils import ( str_to_int, int_or_none, HEADRequest, + unescapeHTML, ) import re class Kanal2IE(InfoExtractor): - _VALID_URL = r'(?P.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P[0-9]+)[^ ]*' + _VALID_URL = r'(?Phttps?:\/\/.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P[a-zA-Z0-9_-]+)[^ ]*' _TESTS = [{ # The most ordinary case 'url': 'http://kanal2.postimees.ee/pluss/video/?id=40792', @@ -68,12 +69,8 @@ class Kanal2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # base url, e.g. kanal2.postimees.ee (in chrome, the black part of the address) base = re.compile(self._VALID_URL).match(url).group('base') - - # Acquire the video's address, where we can search for website data(needed in case of embed player) if "pluss" not in url and "kanal2" in base: - # Generic url for all the kanal2 videos, may redirect url = base + '/pluss/video/?id=' + video_id # This part copied from generic.py, bypasses redirects head_response = self._request_webpage(HEADRequest(url), video_id) @@ -82,79 +79,73 @@ class Kanal2IE(InfoExtractor): if url != new_url: self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) return self.url_result(new_url) - # copied until here xmlfile = self._download_xml(update_url_query(base + '/video/playerPlaylistApi', {'id': video_id}), video_id) + host = xmlfile.find('./playlist/video/streamItems').get('host') - # Remove stacked urls(e.g. http://test.comhttp://test2.com, removes everything before second http) + formats = [{ + 'protocol': re.compile('(?P.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp', + 'app': re.compile(((re.compile('(?P.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp') + ':\/\/[^\0]*\/(?P.+\/)')).match(host).group('app') or 'kanal2vod', + 'url': host + stream.get('streamName'), + 'play_path': 'mp4:' + stream.get('streamName'), + 'ext': 'flv', + 'height': str_to_int(stream.get('height')), + 'width': str_to_int(stream.get('width')), + 'rtmp_real_time': True, + } for stream in xmlfile.findall('./playlist/video/streamItems/streamItem')] + self._sort_formats(formats) + + # Remove stacked urls(e.g. http://test.comhttp://test2.com, removes everything before second http(kanal12 fix)) thumbnail = re.compile('[^\0]*(?Phttps?:\/\/[^"]+)[^\0]*').match(base + xpath_text(xmlfile, './playlist/video/thumbUrl')).group('realurl') average_rating = int_or_none(xpath_text(xmlfile, './playlist/video/rating/value')) - # Determine, whether the stream is high or low quality and act accordingly - for stream in xmlfile.findall('./playlist/video/streamItems/streamItem'): - # Found low quality stream, but keep iterating streamItems in hope of finding hq stream - if "k2lq" in stream.get('streamName'): - streamname = stream.get('streamName') - width = str_to_int(stream.get('width')) - height = str_to_int(stream.get('height')) - continue - # Found high quality stream, looping no longer necessary - if "k2hq" in stream.get('streamName'): - streamname = stream.get('streamName') - width = str_to_int(stream.get('width')) - height = str_to_int(stream.get('height')) - break - webpage = self._download_webpage(url, video_id) - # Is the following info on website? if div player-container is present, info also is if 'player-container' in webpage: - # Find description description = self._search_regex(r'[^\0]*

]*>([^<]*)<\/p>[^\0]*', webpage, 'description', default=None) if description is not None: - # Remove a lot of trailing spaces, that were added to get the text to be in the right place on webpage description = description.strip() - # Episode and season - epandseason = self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None) - if epandseason is not None: - episode = int_or_none(re.compile('Osa *(?P[0-9]+) *Hooaeg *[0-9]+').match(epandseason).group('episode')) - season = int_or_none(re.compile('Osa *[0-9]+ *Hooaeg *(?P[0-9]+)').match(epandseason).group('season')) - # Timestamp generation - dateandtime = self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None) - if dateandtime is not None: - date = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P