diff --git a/youtube_dl/extractor/kanal2.py b/youtube_dl/extractor/kanal2.py
index 97ce8b5ea..b42b3f7a2 100644
--- a/youtube_dl/extractor/kanal2.py
+++ b/youtube_dl/extractor/kanal2.py
@@ -11,13 +11,14 @@ from ..utils import (
str_to_int,
int_or_none,
HEADRequest,
+ unescapeHTML,
)
import re
class Kanal2IE(InfoExtractor):
- _VALID_URL = r'(?P<base>.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[0-9]+)[^ ]*'
+ _VALID_URL = r'(?P<base>https?:\/\/.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[a-zA-Z0-9_-]+)[^ ]*'
_TESTS = [{
# The most ordinary case
'url': 'http://kanal2.postimees.ee/pluss/video/?id=40792',
@@ -68,12 +69,8 @@ class Kanal2IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- # base url, e.g. kanal2.postimees.ee (in chrome, the black part of the address)
base = re.compile(self._VALID_URL).match(url).group('base')
-
- # Acquire the video's address, where we can search for website data(needed in case of embed player)
if "pluss" not in url and "kanal2" in base:
- # Generic url for all the kanal2 videos, may redirect
url = base + '/pluss/video/?id=' + video_id
# This part copied from generic.py, bypasses redirects
head_response = self._request_webpage(HEADRequest(url), video_id)
@@ -82,79 +79,73 @@ class Kanal2IE(InfoExtractor):
if url != new_url:
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
return self.url_result(new_url)
- # copied until here
xmlfile = self._download_xml(update_url_query(base + '/video/playerPlaylistApi', {'id': video_id}), video_id)
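+ # Stream host prefix (expected to look like protocol://server/app/); each streamName below is appended to it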
+ host = xmlfile.find('./playlist/video/streamItems').get('host')
- # Remove stacked urls(e.g. http://test.comhttp://test2.com, removes everything before second http)
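+ # Build one RTMP format per streamItem, parsing the protocol and app out of the host URL ('rtmp' and 'kanal2vod' as fallbacks)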
+ formats = [{
+ 'protocol': re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp',
+ 'app': re.compile(((re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp') + ':\/\/[^\0]*\/(?P<app>.+\/)')).match(host).group('app') or 'kanal2vod',
+ 'url': host + stream.get('streamName'),
+ 'play_path': 'mp4:' + stream.get('streamName'),
+ 'ext': 'flv',
+ 'height': str_to_int(stream.get('height')),
+ 'width': str_to_int(stream.get('width')),
+ 'rtmp_real_time': True,
+ } for stream in xmlfile.findall('./playlist/video/streamItems/streamItem')]
+ self._sort_formats(formats)
+
+ # Remove stacked URLs (e.g. http://test.comhttp://test2.com), keeping only the part from the second http onward (kanal12 fix)
thumbnail = re.compile('[^\0]*(?P<realurl>https?:\/\/[^"]+)[^\0]*').match(base + xpath_text(xmlfile, './playlist/video/thumbUrl')).group('realurl')
average_rating = int_or_none(xpath_text(xmlfile, './playlist/video/rating/value'))
- # Determine, whether the stream is high or low quality and act accordingly
- for stream in xmlfile.findall('./playlist/video/streamItems/streamItem'):
- # Found low quality stream, but keep iterating streamItems in hope of finding hq stream
- if "k2lq" in stream.get('streamName'):
- streamname = stream.get('streamName')
- width = str_to_int(stream.get('width'))
- height = str_to_int(stream.get('height'))
- continue
- # Found high quality stream, looping no longer necessary
- if "k2hq" in stream.get('streamName'):
- streamname = stream.get('streamName')
- width = str_to_int(stream.get('width'))
- height = str_to_int(stream.get('height'))
- break
-
webpage = self._download_webpage(url, video_id)
- # Is the following info on website? if div player-container is present, info also is
if 'player-container' in webpage:
- # Find description
description = self._search_regex(r'[^\0]*<p [^>]*>([^<]*)<\/p>[^\0]*', webpage, 'description', default=None)
if description is not None:
- # Remove a lot of trailing spaces, that were added to get the text to be in the right place on webpage
description = description.strip()
- # Episode and season
- epandseason = self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None)
- if epandseason is not None:
- episode = int_or_none(re.compile('Osa *(?P<episode>[0-9]+) *Hooaeg *[0-9]+').match(epandseason).group('episode'))
- season = int_or_none(re.compile('Osa *[0-9]+ *Hooaeg *(?P<season>[0-9]+)').match(epandseason).group('season'))
- # Timestamp generation
- dateandtime = self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None)
- if dateandtime is not None:
- date = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P