From 0817510dfba8a7de1b8e46f7755994510f82366e Mon Sep 17 00:00:00 2001
From: zmobbie <ottoxas@hotmail.com>
Date: Fri, 12 Aug 2016 00:55:34 +0300
Subject: [PATCH] Kanal2 Add new extractor

---
 youtube_dl/extractor/kanal2.py | 97 +++++++++++++++-------------------
 1 file changed, 44 insertions(+), 53 deletions(-)
diff --git a/youtube_dl/extractor/kanal2.py b/youtube_dl/extractor/kanal2.py
index 97ce8b5ea..b42b3f7a2 100644
--- a/youtube_dl/extractor/kanal2.py
+++ b/youtube_dl/extractor/kanal2.py
@@ -11,13 +11,14 @@ from ..utils import (
     str_to_int,
     int_or_none,
     HEADRequest,
+    unescapeHTML,
 )
 
 import re
 
 
 class Kanal2IE(InfoExtractor):
-    _VALID_URL = r'(?P<base>.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[0-9]+)[^ ]*'
+    _VALID_URL = r'(?P<base>https?:\/\/.+\.postimees\.ee)[a-zA-Z0-9\/._-]+\?[a-zA-Z0-9=&._-]*id=(?P<id>[a-zA-Z0-9_-]+)[^ ]*'
     _TESTS = [{
         # The most ordinary case
         'url': 'http://kanal2.postimees.ee/pluss/video/?id=40792',
@@ -68,12 +69,8 @@ class Kanal2IE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        # base url, e.g. kanal2.postimees.ee (in chrome, the black part of the address)
         base = re.compile(self._VALID_URL).match(url).group('base')
-
-        # Acquire the video's address, where we can search for website data(needed in case of embed player)
         if "pluss" not in url and "kanal2" in base:
-            # Generic url for all the kanal2 videos, may redirect
             url = base + '/pluss/video/?id=' + video_id
             # This part copied from generic.py, bypasses redirects
             head_response = self._request_webpage(HEADRequest(url), video_id)
@@ -82,79 +79,73 @@ class Kanal2IE(InfoExtractor):
                 if url != new_url:
                     self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
                 return self.url_result(new_url)
-                # copied until here
 
         xmlfile = self._download_xml(update_url_query(base + '/video/playerPlaylistApi', {'id': video_id}), video_id)
+        host = xmlfile.find('./playlist/video/streamItems').get('host')
 
-        # Remove stacked urls(e.g. http://test.comhttp://test2.com, removes everything before second http)
+        formats = [{
+            'protocol': re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp',
+            'app': re.compile(((re.compile('(?P<protocol>.+):\/\/[^\0]*').match(host).group('protocol') or 'rtmp') + ':\/\/[^\0]*\/(?P<app>.+\/)')).match(host).group('app') or 'kanal2vod',
+            'url': host + stream.get('streamName'),
+            'play_path': 'mp4:' + stream.get('streamName'),
+            'ext': 'flv',
+            'height': str_to_int(stream.get('height')),
+            'width': str_to_int(stream.get('width')),
+            'rtmp_real_time': True,
+        } for stream in xmlfile.findall('./playlist/video/streamItems/streamItem')]
+        self._sort_formats(formats)
+
+        # Remove stacked urls(e.g. http://test.comhttp://test2.com, removes everything before second http(kanal12 fix))
         thumbnail = re.compile('[^\0]*(?P<realurl>https?:\/\/[^"]+)[^\0]*').match(base + xpath_text(xmlfile, './playlist/video/thumbUrl')).group('realurl')
         average_rating = int_or_none(xpath_text(xmlfile, './playlist/video/rating/value'))
 
-        # Determine, whether the stream is high or low quality and act accordingly
-        for stream in xmlfile.findall('./playlist/video/streamItems/streamItem'):
-            # Found low quality stream, but keep iterating streamItems in hope of finding hq stream
-            if "k2lq" in stream.get('streamName'):
-                streamname = stream.get('streamName')
-                width = str_to_int(stream.get('width'))
-                height = str_to_int(stream.get('height'))
-                continue
-            # Found high quality stream, looping no longer necessary
-            if "k2hq" in stream.get('streamName'):
-                streamname = stream.get('streamName')
-                width = str_to_int(stream.get('width'))
-                height = str_to_int(stream.get('height'))
-                break
-
         webpage = self._download_webpage(url, video_id)
-        # Is the following info on website? if div player-container is present, info also is
         if 'player-container' in webpage:
-            # Find description
             description = self._search_regex(r'[^\0]*<p class="full"[^>]*>([^<]*)<\/p>[^\0]*', webpage, 'description', default=None)
             if description is not None:
-                # Remove a lot of trailing spaces, that were added to get the text to be in the right place on webpage
                 description = description.strip()
-            # Episode and season
-            epandseason = self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None)
-            if epandseason is not None:
-                episode = int_or_none(re.compile('Osa *(?P<episode>[0-9]+) *Hooaeg *[0-9]+').match(epandseason).group('episode'))
-                season = int_or_none(re.compile('Osa *[0-9]+ *Hooaeg *(?P<season>[0-9]+)').match(epandseason).group('season'))
-            # Timestamp generation
-            dateandtime = self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None)
-            if dateandtime is not None:
-                date = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(dateandtime).group('date')
-                time = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(dateandtime).group('time')
+
+            epandseasonregex = re.compile('Osa *(?P<episode>[0-9]+) *Hooaeg *(?P<season>[0-9]+)').match(self._search_regex('[^\0]*(Osa *[0-9]+ *Hooaeg *[0-9]+)[^\0]*', webpage, 'epandseason', default=None))
+            if epandseasonregex is not None:
+                episode = int_or_none(epandseasonregex.group('episode'))
+                season = int_or_none(epandseasonregex.group('season'))
+
+            dateandtimeregex = re.compile('[^\0]*eetris[^\0]*<\/span>[^\0]*(?P<date>[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,})[^0-9]*(?P<time>[0-9]{1,2}:[0-9]{1,2})[^\0]*').match(self._search_regex('[^\0]*(eetris[^\0]*<\/span>[^\0]*[0-9]{1,2}.[0-9]{1,2}.[0-9]{4,}[^0-9]*[0-9]{1,2}:[0-9]{1,2})[^\0]*', webpage, 'dateandtime', default=None))
+            if dateandtimeregex is not None:
+                date = dateandtimeregex.group('date')
+                time = dateandtimeregex.group('time')
                 timestamp = int_or_none((datetime.strptime(date + " " + time, '%d.%m.%Y %H:%M') - datetime(1970, 1, 1) + timedelta(seconds=60 * 60 * 2)).total_seconds())  # No dst support, but added the 2 default hours of estonia
             player_url = self._search_regex('[^\0]embedSWF\("([^"]+)[^\0]', webpage, 'player_url', default=None)
 
-        # There are videos that can only be seen when logged in, so some data can't be accessed(but we can still download the video)
         else:
-            # Try to get description from api(which is mostly empty result) or in other case from og meta tag.
-            description = xpath_text(xmlfile, './playlist/video/description') or self._search_regex('[^\0]og:description"[^\0]*content="(.*)\" \/>', webpage, 'description', default=None)
-            # Basic character parsing to turn character references into real characters. also remove excessive whitespace
-            if description is not None:
-                description = description.strip().replace("&otilde;", "õ").replace("&Otilde;", "Õ").replace("&auml;", "ä").replace("&Auml;", "Ä").replace("&ouml;", "ö").replace("&Ouml;", "Ö").replace("&uuml;", "ü").replace("&Uuml;", "Ü").replace("&amp;", "&")
-
+            description = None
             player_url = None
-            episode = int_or_none(xpath_text(xmlfile, './playlist/video/episode')) or None
-            season = None  # Episode is mostly empty in the xml but season does not even appear there
+            season = None
+            episode = None
             timestamp = None
+
+        if description is None:
+            description = xpath_text(xmlfile, './playlist/video/description') or self._search_regex('[^\0]og:description" *content="(.*)\" *\/>', webpage, 'description', default=None)
+            if description is not None:
+                description = unescapeHTML(description).strip()
+
+        if episode is None:
+            episode = int_or_none(xpath_text(xmlfile, './playlist/video/episode'))
+
+        title = xpath_text(xmlfile, './playlist/video/name')
+        if title is None:
+            title = self._search_regex('[^\0]og:title" *content="(.*)\" *\/>', webpage, 'title', default=None) or self._search_regex('[^\0]<title>(.*)<\/title>[^\0]', webpage, 'description', default=None)
+
         return {
-            'app': "kanal2vod",
             'average_rating': average_rating,
             'description': description,
             'episode_number': episode,
-            'ext': "flv",
-            'height': height,
+            'formats': formats,
             'id': video_id,
             'page_url': url,
             'player_url': player_url,
-            'play_path': "mp4:" + streamname,
-            'protocol': "rtmp",
-            'rtmp_real_time': True,
             'season_number': season,
             'timestamp': timestamp,
-            'title': xpath_text(xmlfile, './playlist/video/name'),
+            'title': title,
             'thumbnail': thumbnail,
-            'url': xmlfile.find('./playlist/video/streamItems').get('host') + streamname,
-            'width': width,
         }