Fixed Dailymotion view_count extraction and added support for playerv5 embed pages

This commit is contained in:
Asadullah Ahmad 2015-07-17 01:00:58 +05:00
parent e901e6fa81
commit 26b26d207d
2 changed files with 41 additions and 29 deletions

Binary file not shown.

View File

@ -19,6 +19,7 @@ from ..utils import (
unescapeHTML, unescapeHTML,
) )
class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionBaseInfoExtractor(InfoExtractor):
@staticmethod @staticmethod
def _build_request(url): def _build_request(url):
@ -33,6 +34,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion' IE_NAME = 'dailymotion'
_FORMATS = [ _FORMATS = [
('stream_h264_ld_url', 'ld'), ('stream_h264_ld_url', 'ld'),
('stream_h264_url', 'standard'), ('stream_h264_url', 'standard'),
@ -121,10 +123,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
embed_request = self._build_request(embed_url) embed_request = self._build_request(embed_url)
embed_page = self._download_webpage( embed_page = self._download_webpage(
embed_request, video_id, 'Downloading embed page') embed_request, video_id, 'Downloading embed page')
info = self._search_regex(r'var info = ({.*?}),$', embed_page, checkv5 = self._search_regex(r'playerV5(.)', embed_page,
'video info', flags=re.MULTILINE, fatal=False) 'checkv5', default=None, fatal=False)
"""For normal embed pages with info JSON""" """For normal embed pages with info variable"""
if info is not None: if checkv5 is None:
info = self._search_regex(r'var info = ({.*?}),$', embed_page,
'video info', flags=re.MULTILINE)
info = json.loads(info) info = json.loads(info)
if info.get('error') is not None: if info.get('error') is not None:
msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
@ -148,9 +152,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not formats: if not formats:
raise ExtractorError('Unable to extract video URL') raise ExtractorError('Unable to extract video URL')
video_subtitles = self.extract_subtitles(video_id, webpage) video_subtitles = self.extract_subtitles(video_id, webpage)
view_count = str_to_int(self._search_regex( view_count = self._search_regex(
r'video_views_count[^>]+>\s+([\d\.,]+)', r'video_views_count[^>]+>\s+([\d\. ]+)\s+views',
webpage, 'view count', fatal=False)) webpage, 'view count', fatal=False)
view_count = view_count.replace(" ", "")
view_count = str_to_int(view_count)
title = self._og_search_title(webpage, default=None) title = self._og_search_title(webpage, default=None)
if title is None: if title is None:
title = self._html_search_regex( title = self._html_search_regex(
@ -171,38 +177,44 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
else: else:
formats = [] formats = []
for (key, format_id) in self._FORMATSv5: for (key, format_id) in self._FORMATSv5:
video_url = self._search_regex(r'%s+".{30}(.*?)"' % key, embed_page, """Verify format is available"""
'video info', flags=re.MULTILINE, fatal=False) checkformat = self._search_regex(r'%s+":(.)' % key, embed_page,
if video_url: 'checkformat', default=None)
if checkformat is not None:
video_url = self._search_regex(r'%s+".{30}(.*?)"' % key, embed_page,
'video info', flags=re.MULTILINE)
video_url = video_url.replace("\\", "") video_url = video_url.replace("\\", "")
if video_url is not None: if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url) m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None: if m_size is not None:
width, height = map(int_or_none, (m_size.group(1), m_size.group(2))) width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else: else:
width, height = None, None width, height = None, None
formats.append({ formats.append({
'url': video_url, 'url': video_url,
'ext': 'mp4', 'ext': 'mp4',
'format_id': format_id, 'format_id': format_id,
'width': width, 'width': width,
'height': height, 'height': height,
}) })
if not formats: if not formats:
raise ExtractorError('Unable to extract video URL from playerv5 page') raise ExtractorError('Unable to extract video URL from playerv5 page')
v5screenname = self._search_regex(r'screenname":"(.*?)"', embed_page, v5screenname = self._search_regex(r'screenname":"(.*?)"', embed_page,
'video info', flags=re.MULTILINE) 'video info-v5screenname', flags=re.MULTILINE, fatal=False)
v5thumbnailurl = self._search_regex(r'poster_url":"(.*?)"', embed_page, v5thumbnailurl = self._search_regex(r'poster_url":"(.*?)"', embed_page,
'video info', flags=re.MULTILINE) 'video info-v5thumbnailurl', flags=re.MULTILINE, fatal=False)
if v5thumbnailurl is not None:
v5thumbnailurl = v5thumbnailurl.replace("\\", "")
video_subtitles = self.extract_subtitles(video_id, webpage) video_subtitles = self.extract_subtitles(video_id, webpage)
view_count = str_to_int(self._search_regex( view_count = self._search_regex(r'video_views_count[^>]+>\s+([\d\. ]+)\s+views',
r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False)
webpage, 'view count', fatal=False)) view_count = view_count.replace(" ", "")
view_count = str_to_int(view_count)
title = self._og_search_title(webpage, default=None) title = self._og_search_title(webpage, default=None)
if title is None: if title is None:
title = self._html_search_regex( title = self._html_search_regex(
r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage, r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
'title') 'title')
return { return {
'id': video_id, 'id': video_id,
'formats': formats, 'formats': formats,