Kanal2 Add new extractor
This commit is contained in:
parent
ea80d6ff9b
commit
0817510dfb
@ -11,13 +11,14 @@ from ..utils import (
|
|||||||
str_to_int,
|
str_to_int,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
HEADRequest,
|
HEADRequest,
|
||||||
|
unescapeHTML,
|
||||||
)
|
)
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
class Kanal2IE(InfoExtractor):
|
class Kanal2IE(InfoExtractor):
|
||||||
_VALID_URL = r'(?P<base>.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[0-9]+)[^ ]*'
|
_VALID_URL = r'(?P<base>https?:\/\/.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[a-zA-Z0-9_-]+)[^ ]*'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
# The most ordinary case
|
# The most ordinary case
|
||||||
'url': 'http://kanal2.postimees.ee/pluss/video/?id=40792',
|
'url': 'http://kanal2.postimees.ee/pluss/video/?id=40792',
|
||||||
@ -68,12 +69,8 @@ class Kanal2IE(InfoExtractor):
|
|||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
# Base URL, e.g. kanal2.postimees.ee (the part of the address that Chrome shows in black)
|
|
||||||
base = re.compile(self._VALID_URL).match(url).group('base')
|
base = re.compile(self._VALID_URL).match(url).group('base')
|
||||||
|
|
||||||
# Work out the video page's address, which we can search for website data (needed for embedded players)
|
|
||||||
if "pluss" not in url and "kanal2" in base:
|
if "pluss" not in url and "kanal2" in base:
|
||||||
# Generic URL that works for all kanal2 videos; it may redirect
|
|
||||||
url = base + '/pluss/video/?id=' + video_id
|
url = base + '/pluss/video/?id=' + video_id
|
||||||
# This part copied from generic.py, bypasses redirects
|
# This part copied from generic.py, bypasses redirects
|
||||||
head_response = self._request_webpage(HEADRequest(url), video_id)
|
head_response = self._request_webpage(HEADRequest(url), video_id)
|
||||||
@ -82,79 +79,73 @@ class Kanal2IE(InfoExtractor):
|
|||||||
if url != new_url:
|
if url != new_url:
|
||||||
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
|
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
|
||||||
return self.url_result(new_url)
|
return self.url_result(new_url)
|
||||||
# copied until here
|
|
||||||
|
|
||||||
xmlfile = self._download_xml(update_url_query(base + '/video/playerPlaylistApi', {'id': video_id}), video_id)
|
xmlfile = self._download_xml(update_url_query(base + '/video/playerPlaylistApi', {'id': video_id}), video_id)
|
||||||
|
host = xmlfile.find('./playlist/video/streamItems').get('host')
|
||||||
|
|
||||||
# Remove stacked URLs (e.g. http://test.comhttp://test2.com): drop everything before the second http
|
formats = [{
|
||||||
|
'protocol': re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp',
|
||||||
|
'app': re.compile(((re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp') + ':\/\/[^\0]*\/(?P<app>.+\/)')).match(host).group('app') or 'kanal2vod',
|
||||||
|
'url': host + stream.get('streamName'),
|
||||||
|
'play_path': 'mp4:' + stream.get('streamName'),
|
||||||
|
'ext': 'flv',
|
||||||
|
'height': str_to_int(stream.get('height')),
|
||||||
|
'width': str_to_int(stream.get('width')),
|
||||||
|
'rtmp_real_time': True,
|
||||||
|
} for stream in xmlfile.findall('./playlist/video/streamItems/streamItem')]
|
||||||
|
self._sort_formats(formats)
|
||||||
|
|
||||||
|
# Remove stacked URLs (e.g. http://test.comhttp://test2.com): drop everything before the second http (kanal12 fix)
|
||||||
thumbnail = re.compile('[^\0]*(?P<realurl>https?:\/\/[^"]+)[^\0]*').match(base + xpath_text(xmlfile, './playlist/video/thumbUrl')).group('realurl')
|
thumbnail = re.compile('[^\0]*(?P<realurl>https?:\/\/[^"]+)[^\0]*').match(base + xpath_text(xmlfile, './playlist/video/thumbUrl')).group('realurl')
|
||||||
average_rating = int_or_none(xpath_text(xmlfile, './playlist/video/rating/value'))
|
average_rating = int_or_none(xpath_text(xmlfile, './playlist/video/rating/value'))
|
||||||
|
|
||||||
# Determine whether the stream is high or low quality and act accordingly
|
|
||||||
for stream in xmlfile.findall('./playlist/video/streamItems/streamItem'):
|
|
||||||
# Found a low-quality stream, but keep iterating over streamItems in the hope of finding a high-quality one
|
|
||||||
if "k2lq" in stream.get('streamName'):
|
|
||||||
streamname = stream.get('streamName')
|
|
||||||
width = str_to_int(stream.get('width'))
|
|
||||||
height = str_to_int(stream.get('height'))
|
|
||||||
continue
|
|
||||||
# Found a high-quality stream, so no further looping is necessary
|
|
||||||
if "k2hq" in stream.get('streamName'):
|
|
||||||
streamname = stream.get('streamName')
|
|
||||||
width = str_to_int(stream.get('width'))
|
|
||||||
height = str_to_int(stream.get('height'))
|
|
||||||
break
|
|
||||||
|
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
# Is the following info on the web page? If the player-container div is present, the info is too
|
|
||||||
if 'player-container' in webpage:
|
if 'player-container' in webpage:
|
||||||
# Find description
|
|
||||||
description = self._search_regex(r'[^\0]*<p class="full"[^>]*>([^<]*)<\/p>[^\0]*', webpage, 'description', default=None)
|
description = self._search_regex(r'[^\0]*<p class="full"[^>]*>([^<]*)<\/p>[^\0]*', webpage, 'description', default=None)
|
||||||
if description is not None:
|
if description is not None:
|
||||||
# Strip the many trailing spaces that were added to position the text correctly on the web page
|
|
||||||
description = description.strip()
|
description = description.strip()
|
||||||
# Episode and season
|
|
||||||
epandseason = self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None)
|
epandseasonregex = re.compile('Osa *(?P<episode>[0-9]+) *Hooaeg *(?P<season>[0-9]+)').match(self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None))
|
||||||
if epandseason is not None:
|
if epandseasonregex is not None:
|
||||||
episode = int_or_none(re.compile('Osa *(?P<episode>[0-9]+) *Hooaeg *[0-9]+').match(epandseason).group('episode'))
|
episode = int_or_none(epandseasonregex.group('episode'))
|
||||||
season = int_or_none(re.compile('Osa *[0-9]+ *Hooaeg *(?P<season>[0-9]+)').match(epandseason).group('season'))
|
season = int_or_none(epandseasonregex.group('season'))
|
||||||
# Timestamp generation
|
|
||||||
dateandtime = self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None)
|
dateandtimeregex = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None))
|
||||||
if dateandtime is not None:
|
if dateandtimeregex is not None:
|
||||||
date = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(dateandtime).group('date')
|
date = dateandtimeregex.group('date')
|
||||||
time = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(dateandtime).group('time')
|
time = dateandtimeregex.group('time')
|
||||||
timestamp = int_or_none((datetime.strptime(date + " " + time, '%d.%m.%Y %H:%M') - datetime(1970, 1, 1) + timedelta(seconds=60 * 60 * 2)).total_seconds()) # No dst support, but added the 2 default hours of estonia
|
timestamp = int_or_none((datetime.strptime(date + " " + time, '%d.%m.%Y %H:%M') - datetime(1970, 1, 1) + timedelta(seconds=60 * 60 * 2)).total_seconds()) # No dst support, but added the 2 default hours of estonia
|
||||||
player_url = self._search_regex('[^\0]embedSWF\("([^"]+)[^\0]', webpage, 'player_url', default=None)
|
player_url = self._search_regex('[^\0]embedSWF\("([^"]+)[^\0]', webpage, 'player_url', default=None)
|
||||||
|
|
||||||
# Some videos can only be viewed when logged in, so some data can't be accessed (but we can still download the video)
|
|
||||||
else:
|
else:
|
||||||
# Try to get the description from the API (which mostly returns an empty result); otherwise fall back to the og:description meta tag
|
description = None
|
||||||
description = xpath_text(xmlfile, './playlist/video/description') or self._search_regex('[^\0]og:description"[^\0]*content="(.*)\" \/>', webpage, 'description', default=None)
|
|
||||||
# Basic parsing that turns character references into real characters; also removes excess whitespace
|
|
||||||
if description is not None:
|
|
||||||
description = description.strip().replace("õ", "õ").replace("Õ", "Õ").replace("ä", "ä").replace("Ä", "Ä").replace("ö", "ö").replace("Ö", "Ö").replace("ü", "ü").replace("Ü", "Ü").replace("&", "&")
|
|
||||||
|
|
||||||
player_url = None
|
player_url = None
|
||||||
episode = int_or_none(xpath_text(xmlfile, './playlist/video/episode')) or None
|
season = None
|
||||||
season = None # Episode is mostly empty in the xml but season does not even appear there
|
episode = None
|
||||||
timestamp = None
|
timestamp = None
|
||||||
|
|
||||||
|
if description is None:
|
||||||
|
description = xpath_text(xmlfile, './playlist/video/description') or self._search_regex('[^\0]og:description" *content="(.*)\" *\/>', webpage, 'description', default=None)
|
||||||
|
if description is not None:
|
||||||
|
description = unescapeHTML(description).strip()
|
||||||
|
|
||||||
|
if episode is None:
|
||||||
|
episode = int_or_none(xpath_text(xmlfile, './playlist/video/episode'))
|
||||||
|
|
||||||
|
title = xpath_text(xmlfile, './playlist/video/name')
|
||||||
|
if title is None:
|
||||||
|
title = self._search_regex('[^\0]og:title" *content="(.*)\" *\/>', webpage, 'title', default=None) or self._search_regex('[^\0]<title>(.*)<\/title>[^\0]', webpage, 'description', default=None)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'app': "kanal2vod",
|
|
||||||
'average_rating': average_rating,
|
'average_rating': average_rating,
|
||||||
'description': description,
|
'description': description,
|
||||||
'episode_number': episode,
|
'episode_number': episode,
|
||||||
'ext': "flv",
|
'formats': formats,
|
||||||
'height': height,
|
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'page_url': url,
|
'page_url': url,
|
||||||
'player_url': player_url,
|
'player_url': player_url,
|
||||||
'play_path': "mp4:" + streamname,
|
|
||||||
'protocol': "rtmp",
|
|
||||||
'rtmp_real_time': True,
|
|
||||||
'season_number': season,
|
'season_number': season,
|
||||||
'timestamp': timestamp,
|
'timestamp': timestamp,
|
||||||
'title': xpath_text(xmlfile, './playlist/video/name'),
|
'title': title,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
'url': xmlfile.find('./playlist/video/streamItems').get('host') + streamname,
|
|
||||||
'width': width,
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user