Merge remote-tracking branch 'upstream/master'

Author: Marco Ferragina
Date: 2015-10-16 09:43:08 +02:00
Commit: 5ba6537e03
11 changed files with 129 additions and 102 deletions

View File: test/test_InfoExtractor.py

@@ -35,10 +35,14 @@ class TestInfoExtractor(unittest.TestCase):
             <meta name="og:title" content='Foo'/>
             <meta content="Some video's description " name="og:description"/>
             <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
+            <meta content='application/x-shockwave-flash' property='og:video:type'>
+            <meta content='Foo' property=og:foobar>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
         self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
+        self.assertEqual(ie._og_search_video_url(html, default=None), None)
+        self.assertEqual(ie._og_search_property('foobar', html), 'Foo')

     def test_html_search_meta(self):
         ie = self.ie

View File: youtube_dl/extractor/__init__.py

@@ -319,7 +319,6 @@ from .macgamestore import MacGameStoreIE
 from .mailru import MailRuIE
 from .malemotion import MalemotionIE
 from .mdr import MDRIE
-from .megavideoz import MegaVideozIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE

View File: youtube_dl/extractor/channel9.py

@@ -3,7 +3,11 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    parse_filesize,
+    qualities,
+)


 class Channel9IE(InfoExtractor):
@@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor):
                 'title': 'Developer Kick-Off Session: Stuff We Love',
                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                 'duration': 4576,
-                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+                'thumbnail': 're:http://.*\.jpg',
                 'session_code': 'KOS002',
                 'session_day': 'Day 1',
                 'session_room': 'Arena 1A',
@@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor):
                 'title': 'Self-service BI with Power BI - nuclear testing',
                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                 'duration': 1540,
-                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+                'thumbnail': 're:http://.*\.jpg',
                 'authors': ['Mike Wilmot'],
             },
+        },
+        {
+            # low quality mp4 is best
+            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+            'info_dict': {
+                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+                'ext': 'mp4',
+                'title': 'Ranges for the Standard Library',
+                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+                'duration': 5646,
+                'thumbnail': 're:http://.*\.jpg',
+            },
+            'params': {
+                'skip_download': True,
+            },
         }
     ]

     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

-    # Sorted by quality
-    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
-
-    def _restore_bytes(self, formatted_size):
-        if not formatted_size:
-            return 0
-        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
-        if not m:
-            return 0
-        units = m.group('units')
-        try:
-            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
-        except ValueError:
-            return 0
-        size = float(m.group('size'))
-        return int(size * (1024 ** exponent))
-
     def _formats_from_html(self, html):
         FORMAT_REGEX = r'''
             (?x)
@@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):
             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
             </div>)?                                # File size part may be missing
         '''

-        # Extract known formats
+        quality = qualities((
+            'MP3', 'MP4',
+            'Low Quality WMV', 'Low Quality MP4',
+            'Mid Quality WMV', 'Mid Quality MP4',
+            'High Quality WMV', 'High Quality MP4'))
         formats = [{
             'url': x.group('url'),
             'format_id': x.group('quality'),
             'format_note': x.group('note'),
             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
-            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
-            'preference': self._known_formats.index(x.group('quality')),
+            'filesize_approx': parse_filesize(x.group('filesize')),
+            'quality': quality(x.group('quality')),
             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
-        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+        } for x in list(re.finditer(FORMAT_REGEX, html))]

         self._sort_formats(formats)

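The channel9 change above replaces the hand-rolled _restore_bytes() helper and the _known_formats preference list with the shared youtube_dl.utils helpers parse_filesize() and qualities(); unknown format ids no longer need to be filtered out, they simply sort last. A minimal sketch of how the two helpers behave (the printed values are illustrative, based on the utils implementation around this merge):

from youtube_dl.utils import parse_filesize, qualities

# qualities() returns a callable mapping a format id to its index in the
# preference-ordered tuple; ids it does not know about map to -1.
quality = qualities((
    'MP3', 'MP4',
    'Low Quality WMV', 'Low Quality MP4',
    'Mid Quality WMV', 'Mid Quality MP4',
    'High Quality WMV', 'High Quality MP4'))
print(quality('High Quality MP4'))  # 7 -- most preferred
print(quality('Ogg Vorbis'))        # -1 -- unknown ids sort last instead of raising

# parse_filesize() converts the human-readable size scraped from the page
# into bytes (decimal multiplier for 'MB', binary for 'MiB'); None passes through.
print(parse_filesize('326.1 MB'))   # 326100000
print(parse_filesize(None))         # None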
View File: youtube_dl/extractor/common.py

@@ -646,7 +646,8 @@ class InfoExtractor(object):
     @staticmethod
     def _og_regexes(prop):
         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))'
-        property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' % re.escape(prop)
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'
         return [
             template % (property_re, content_re),

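The reworked _og_regexes() above accepts og properties whose attribute value is unquoted (property=og:foobar), which is exactly the case added to the test file at the top of this commit. A quick standalone check of the rewritten pattern (hypothetical snippet, not part of the commit):

import re

prop = 'foobar'
content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))'
property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
               % {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'

html = "<meta content='Foo' property=og:foobar>"
# The content-before-property ordering is the one that matches this markup;
# groups 1-3 of content_re capture double-quoted, single-quoted and unquoted values.
m = re.search(template % (content_re, property_re), html)
print(m.group(2))  # Foo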
View File: youtube_dl/extractor/criterion.py

@@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor):
         final_url = self._search_regex(
             r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url')
         title = self._og_search_title(webpage)
-        description = self._html_search_regex(
-            r'<meta name="description" content="(.+?)" />',
-            webpage, 'video description')
+        description = self._html_search_meta('description', webpage)
         thumbnail = self._search_regex(
             r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
             webpage, 'thumbnail url')

View File: youtube_dl/extractor/fivemin.py

@@ -2,11 +2,15 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..compat import (
-    compat_str,
     compat_urllib_parse,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
+    parse_duration,
+    replace_extension,
 )
@@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor):
                 'id': '518013791',
                 'ext': 'mp4',
                 'title': 'iPad Mini with Retina Display Review',
+                'duration': 177,
             },
         },
         {
@@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor):
                 'id': '518086247',
                 'ext': 'mp4',
                 'title': 'How to Make a Next-Level Fruit Salad',
+                'duration': 184,
             },
         },
     ]
+    _ERRORS = {
+        'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.',
+        'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.',
+        'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.',
+        'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.',
+        'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+        'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+    }
+    _QUALITIES = {
+        1: {
+            'width': 640,
+            'height': 360,
+        },
+        2: {
+            'width': 854,
+            'height': 480,
+        },
+        4: {
+            'width': 1280,
+            'height': 720,
+        },
+        8: {
+            'width': 1920,
+            'height': 1080,
+        },
+        16: {
+            'width': 640,
+            'height': 360,
+        },
+        32: {
+            'width': 854,
+            'height': 480,
+        },
+        64: {
+            'width': 1280,
+            'height': 720,
+        },
+        128: {
+            'width': 640,
+            'height': 360,
+        },
+    }

     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor):
             'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
             video_id)
         if not response['success']:
-            err_msg = response['errorMessage']
-            if err_msg == 'ErrorVideoUserNotGeo':
-                msg = 'Video not available from your location'
-            else:
-                msg = 'Aol said: %s' % err_msg
-            raise ExtractorError(msg, expected=True, video_id=video_id)
+            raise ExtractorError(
+                '%s said: %s' % (
+                    self.IE_NAME,
+                    self._ERRORS.get(response['errorMessage'], response['errorMessage'])),
+                expected=True)
         info = response['binding'][0]

-        second_id = compat_str(int(video_id[:-2]) + 1)
         formats = []
-        for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
-            if any(r['ID'] == quality for r in info['Renditions']):
+        parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
+            compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
+        for rendition in info['Renditions']:
+            if rendition['RenditionType'] == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
+            elif rendition['RenditionType'] == 'aac':
+                continue
+            else:
+                rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
+                quality = self._QUALITIES.get(rendition['ID'], {})
                 formats.append({
-                    'format_id': compat_str(quality),
-                    'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
-                    'height': height,
+                    'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']),
+                    'url': rendition_url,
+                    'width': quality.get('width'),
+                    'height': quality.get('height'),
                 })
+        self._sort_formats(formats)

         return {
             'id': video_id,
             'title': info['Title'],
+            'thumbnail': info.get('ThumbURL'),
+            'duration': parse_duration(info.get('Duration')),
             'formats': formats,
         }

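In the rewritten fivemin extractor above, each rendition URL is built by taking the videoUrl query parameter out of EmbededURL, splicing the rendition ID into the double slash in its path and swapping the extension for the rendition's container type. A toy illustration of that string surgery (the URL below is invented; real values come from the SenseHandler response):

from youtube_dl.compat import compat_urllib_parse_urlparse, compat_urlparse
from youtube_dl.utils import replace_extension

# Hypothetical, already-extracted videoUrl value; the '//' in the path is the
# slot where the rendition ID is inserted.
parsed_video_url = compat_urllib_parse_urlparse(
    'http://videos.example.com/path//518013791.flv')

rendition_id, rendition_type = 4, 'mp4'
rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(
    path=replace_extension(
        parsed_video_url.path.replace('//', '/%s/' % rendition_id),
        rendition_type)))
print(rendition_url)  # http://videos.example.com/path/4/518013791.mp4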
View File: youtube_dl/extractor/jeuxvideo.py

@@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         title = mobj.group(1)
         webpage = self._download_webpage(url, title)
-        title = self._html_search_meta('name', webpage)
+        title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
         config_url = self._html_search_regex(
             r'data-src="(/contenu/medias/video.php.*?)"',
             webpage, 'config URL')

View File: youtube_dl/extractor/megavideoz.py (deleted)

@@ -1,56 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    xpath_text,
-)
-
-
-class MegaVideozIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P<id>[^/]+)(?:/(?P<display_id>[^/]+))?'
-    _TEST = {
-        'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader',
-        'info_dict': {
-            'id': '48723',
-            'display_id': 'SMPTE-Universal-Film-Leader',
-            'ext': 'mp4',
-            'title': 'SMPTE Universal Film Leader',
-            'thumbnail': 're:https?://.*?\.jpg',
-            'duration': 10.93,
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id') or video_id
-
-        webpage = self._download_webpage(url, display_id)
-
-        if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')):
-            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
-        config = self._download_xml(
-            self._search_regex(
-                r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'),
-            display_id)
-
-        video_url = xpath_text(config, './file', 'video url', fatal=True)
-        title = xpath_text(config, './title', 'title', fatal=True)
-        thumbnail = xpath_text(config, './image', 'thumbnail')
-        duration = float_or_none(xpath_text(config, './duration', 'duration'))
-        video_id = xpath_text(config, './mediaid', 'video id') or video_id
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'url': video_url,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration
-        }

View File: youtube_dl/extractor/vimeo.py

@@ -212,7 +212,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         url = url.replace('http://', 'https://')
         password_request = compat_urllib_request.Request(url + '/password', data)
         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        password_request.add_header('Cookie', 'clip_v=1; vuid=%s' % vuid)
+        password_request.add_header('Cookie', 'clip_test2=1; vuid=%s' % vuid)
         password_request.add_header('Referer', url)
         return self._download_webpage(
             password_request, video_id,

View File: youtube_dl/extractor/yandexmusic.py

@@ -46,6 +46,12 @@ class YandexMusicTrackIE(InfoExtractor):
                % (data['host'], key, data['ts'] + data['path'], storage[1]))

     def _get_track_info(self, track):
+        thumbnail = None
+        cover_uri = track.get('albums', [{}])[0].get('coverUri')
+        if cover_uri:
+            thumbnail = cover_uri.replace('%%', 'orig')
+            if not thumbnail.startswith('http'):
+                thumbnail = 'http://' + thumbnail
         return {
             'id': track['id'],
             'ext': 'mp3',
@@ -53,6 +59,7 @@ class YandexMusicTrackIE(InfoExtractor):
             'title': '%s - %s' % (track['artists'][0]['name'], track['title']),
             'filesize': int_or_none(track.get('fileSize')),
             'duration': float_or_none(track.get('durationMs'), 1000),
+            'thumbnail': thumbnail,
         }

     def _real_extract(self, url):

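The yandexmusic change above derives a thumbnail from the album's coverUri, which ends in a '%%' size placeholder and may come back without a scheme. A tiny sketch with a made-up value:

# Hypothetical coverUri taken from the track's first album entry:
cover_uri = 'avatars.example.net/get-music-content/12345/abcd.a.678-1/%%'

thumbnail = cover_uri.replace('%%', 'orig')   # ask for the original-size cover
if not thumbnail.startswith('http'):
    thumbnail = 'http://' + thumbnail         # add the missing scheme
print(thumbnail)  # http://avatars.example.net/get-music-content/12345/abcd.a.678-1/orig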
View File: youtube_dl/utils.py

@@ -1371,7 +1371,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
         v = getattr(v, get_attr, None)
     if v == '':
         v = None
-    return default if v is None else (int(v) * invscale // scale)
+    if v is None:
+        return default
+    try:
+        return int(v) * invscale // scale
+    except ValueError:
+        return default


 def str_or_none(v, default=None):
@@ -1387,7 +1392,12 @@ def str_to_int(int_str):
 def float_or_none(v, scale=1, invscale=1, default=None):
-    return default if v is None else (float(v) * invscale / scale)
+    if v is None:
+        return default
+    try:
+        return float(v) * invscale / scale
+    except ValueError:
+        return default


 def parse_duration(s):
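With the try/except added above, int_or_none() and float_or_none() return the default for values that cannot be parsed instead of letting a ValueError escape. A rough behaviour sketch (expected results in the comments):

from youtube_dl.utils import float_or_none, int_or_none

print(int_or_none('42'))                   # 42
print(int_or_none('abc'))                  # None -- previously raised ValueError
print(int_or_none(None, default=-1))       # -1, the default applied for None
print(float_or_none('10930', scale=1000))  # 10.93
print(float_or_none('n/a'))                # None instead of a crash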