[fc-zenit] New extractor

2015-10-12 01:25:57 -04:00 · 2015-10-12 01:25:57 -04:00 · 634d079a98
commit 634d079a98
parent cd7364a89c
2 changed files with 53 additions and 1 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -76,7 +76,6 @@ from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
-from .chaturbate import ChaturbateIE
 from .chilloutzone import ChilloutzoneIE
 from .chirbit import (
    ChirbitIE,
@@ -167,6 +166,7 @@ from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
+from .fczenit import fczenitIE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
 from .fivemin import FiveMinIE
--- a/youtube_dl/extractor/fczenit.py
+++ b/youtube_dl/extractor/fczenit.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os.path
+import re
+import json
+
+from ..compat import compat_urllib_parse_unquote
+from ..utils import url_basename
+from .common import InfoExtractor
+
+class fczenitIE(InfoExtractor):
+    """Information extractor for video pages under fc-zenit.ru/video/gl<id>."""
+
+    _VALID_URL = r'(?:https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+))'
+    _TEST = {
+        'url': 'http://fc-zenit.ru/video/gl6785/',
+        'md5': '458bacc24549173fe5a5aa29174a5606',
+        'info_dict': {
+            'id': '6785',
+            'ext': 'mp4',
+            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a dict with the id, title and formats of the video at `url`."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        # The page is always fetched from this fixed address built from the id,
+        # whatever scheme or host variant the caller's URL used.
+        webpage = self._download_webpage(
+            'http://fc-zenit.ru/video/gl' + video_id, video_id)
+
+        video_title = self._html_search_regex(
+            r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+        # Log that we are starting to parse the page
+        self.report_extraction(video_id)
+
+        # Take the line following "bitrates:"; it is expected to hold the
+        # (url, bitrate) source entries matched below.
+        bitrates_raw = self._html_search_regex(
+            r'bitrates:.*\n(.*)\]', webpage, 'video URL')
+        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+
+        # The pattern captures exactly three digits, so int() cannot fail; a
+        # numeric tbr keeps bitrate comparison and formatting from acting on text.
+        formats = [{'url': src, 'tbr': int(rate)} for src, rate in bitrates]
+        self._sort_formats(formats)
+
+        # No top-level 'url'/'ext': the media URLs are carried by 'formats'.
+        return {'id': video_id, 'title': video_title, 'formats': formats}