[YouWatch] Add new extractor (for testing purpose)

This commit is contained in:
sulyi 2016-11-20 21:09:55 +01:00
parent 58355a3bf1
commit d18c635985
3 changed files with 180 additions and 0 deletions

View File

@ -937,6 +937,7 @@
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **YouWatch**
- **Zapiks** - **Zapiks**
- **ZDF** - **ZDF**
- **ZDFChannel** - **ZDFChannel**

View File

@ -1190,6 +1190,7 @@ from .youtube import (
YoutubeUserIE, YoutubeUserIE,
YoutubeWatchLaterIE, YoutubeWatchLaterIE,
) )
from .youwatch import YouWatchIE
from .zapiks import ZapiksIE from .zapiks import ZapiksIE
from .zdf import ZDFIE, ZDFChannelIE from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE from .zingmp3 import ZingMp3IE

View File

@ -0,0 +1,178 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import sys
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext, int_or_none,
unified_strdate,
js_to_json)
class YouWatchIE(InfoExtractor):
_VALID_URL = r'^(https?://(?:www\.)?youwatch\.org/)(?P<embed>embed-)?(?P<id>[a-z0-9]+)(\.html)?$'
_TESTS = [
{
'url': 'http://youwatch.org/ncvag7qib06a',
'md5': 'fcc3d1b77d41921ab408ddfbc51604b1',
'info_dict': {
'id': 'ncvag7qib06a',
'ext': 'mp4',
'title': 'A 4 mesterl v sz',
'thumbnail': 'http://212.7.205.2/i/03/00000/ncvag7qib06a.jpg',
'upload_date': '20140710',
'view_count': int
}
}, {
'url': 'http://youwatch.org/r4gmiun6qx57',
'md5': '71a151e542ae86a023d8cc57e243a917',
'info_dict': {
'id': 'r4gmiun6qx57',
'ext': 'mp4',
'title': 'A h<>z 2 A m<>sodik t<>rt<72>net',
'thumbnail': 'http://212.7.211.67/i/05/00000/r4gmiun6qx57.jpg',
'upload_date': '20150509',
'view_count': int
}
}, {
'url': 'http://youwatch.org/t179wlmqbndd',
'md5': '1984ebed1fb5fee33aa7076a73faffa3',
'info_dict': {
'id': 't179wlmqbndd',
'ext': 'mp4',
'title': 'bd-ijf',
'thumbnail': 'http://212.7.211.67/i/04/00000/t179wlmqbndd.jpg',
'upload_date': '20150523',
'view_count': int
}
}, {
'url': 'http://youwatch.org/embed-2a3y6svmsofw.html',
'md5': 'ded91fbca8c913d493263ae26e5e18d1',
'info_dict': {
'id': '2a3y6svmsofw',
'ext': 'mp4',
'title': 'A Vadnyugat v<>gnapjai (2008)',
'thumbnail': 'http://212.7.211.68/i/05/00362/0iy24azid2xj.jpg',
'upload_date': '20140223',
'view_count': int
}
}
]
def extract_arguments(self, call, code):
if not call.endswith(')'):
pattern = r'%s\s*\(' % re.escape(call)
else:
pattern = re.escape(call)
mobj = re.search(pattern, code)
if mobj:
# XXX: context-free!
close_pos = open_pos = mobj.end()
counter = 1
while counter > 0:
if close_pos > len(code):
break
c = code[close_pos]
close_pos += 1
if c == '(':
counter += 1
elif c == ')':
counter -= 1
else:
return code[open_pos:close_pos - 1]
@staticmethod
def __v_rot(text):
if text is None:
return None
rotated = ''
for letter in text:
l_code = ord(letter)
if 64 < l_code < 91: # ord('A'): 65 ord('Z'): 90
rotated += chr((l_code - 65 - 13) % 26 + 65)
elif 96 < l_code < 123: # ord('a'): 97 ord('z'): 122
rotated += chr((l_code - 97 - 13) % 26 + 97)
else:
rotated += letter
return rotated
def _real_extract(self, url):
video_id = self._match_id(url)
embed_url = None
if self._VALID_URL_RE.match(url).group('embed') is not None:
embed_url = url
url = self._VALID_URL_RE.sub(r'\1\g<id>', url)
webpage = self._download_webpage(url, video_id)
if 'This server is in maintenance mode. Refresh this page in some minutes.' in webpage:
raise ExtractorError('Video is temporally unavailable. ',
sys.exc_info()[2], True, video_id=video_id)
if 'The file you were looking for could not be found, sorry for any inconvenience.' in webpage:
raise ExtractorError('Video is gone. ',
sys.exc_info()[2], True, video_id=video_id)
title = self._html_search_regex(r'''<h3\s+.*id=("|')title\1.*>\s*(?P<title>.+?)\s*<\s*/''',
webpage, 'title', fatal=False,
flags=re.IGNORECASE, group='title')
title = self.__v_rot(title)
ul_date = self._html_search_regex(r'''<i\s.*class=("|')fa fa-calendar\1.*>\s*</i>\s*on\s+(?P<date>.*)\s*<''',
webpage, 'upload date', fatal=False,
flags=re.MULTILINE | re.IGNORECASE, group='date')
ul_date = unified_strdate(ul_date, day_first=False)
views = self._html_search_regex(r'''<\s*([^\s]+).*title=("|')views\2.*>\s*(?P<count>.*)\s*<\s*\/\s*\1\s*>''',
webpage, 'views', fatal=False,
flags=re.IGNORECASE, group='count')
views = int_or_none(views)
if embed_url is None:
embed_url = self._html_search_regex(
r'''<iframe[^>]+class=("|')embed-responsive-item\1[^>]+src=("|')(?P<url>((?!\2).)+)\2''',
webpage, 'embed url', flags=re.IGNORECASE, group='url')
embed_html = self._download_webpage(embed_url, video_id)
ref_url = self._html_search_regex(
r'''<iframe[^>]+src=("|')(?P<url>((?!\1).)+)\1''',
embed_html, 'referer', flags=re.IGNORECASE, group='url')
embed_html = self._download_webpage(ref_url, video_id)
jwplayer_setup_script = self._html_search_regex(
r'''<span\b.*id=("|')vplayer\1[^>]*>\s*<img\b[^>]*>\s*</span>\s*''' +
r'''<script\b[^>]*type=("|')((?!\2).)*\2[^>]*>(?P<script>((?!</script>).|\s)*)''',
embed_html, 'jwplayer setup script', flags=re.IGNORECASE, group='script'
)
jwplayer_setup = self._html_search_regex(r'(jwplayer\s*\((.|\n)*\)\.setup)',
jwplayer_setup_script, 'jwplayer')
jwplayer_json = self.extract_arguments(jwplayer_setup, jwplayer_setup_script)
jwplayer_json = js_to_json(jwplayer_json)
# fix unbalanced commas
jwplayer_json = re.sub(r',\s*([}\]])', '\g<1>', jwplayer_json)
jwplayer_json = self._parse_json(jwplayer_json, video_id)
video_urls = [
{
'url': source['file'],
'format_id': source['label'],
'ext': determine_ext(source['file'])
} for source in jwplayer_json['sources']
]
info_dict = {'id': video_id, 'formats': video_urls, 'http_headers': {'Referer': ref_url}}
if jwplayer_json['image'] is not None:
info_dict['thumbnail'] = jwplayer_json['image']
if title is not None:
info_dict['title'] = title
if ul_date is not None:
info_dict['upload_date'] = ul_date
if views is not None:
info_dict['view_count'] = views
return info_dict