diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 789dd79d5..c4555142e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor): _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' - _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' + _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=%s' _TESTS = [{ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', @@ -218,6 +218,25 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses', 'uploader': 'ESL One Dota 2', + 'timestamp': 1527084179, + 'upload_date': '20180523', + 'uploader_id': '234218833769558', + 'is_live': False + }, + 'params': { + 'skip_download': True, + }, + }, { + # no timestamp + 'url': 'https://www.facebook.com/SuperNewsGames/videos/642255722780473/', + 'info_dict': { + 'timestamp': 1521221400, + 'uploader': 'Super News Games', + 'uploader_id': '229550157384367', + 'id': '642255722780473', + 'ext': 'mp4', + 'upload_date': '20180316', + 'title': 'The Voice of Nick is trying Fortnite after 100 hours of PLAYERUNKNOWN\'S BATTL...', }, 'params': { 'skip_download': True, @@ -339,6 +358,7 @@ class FacebookIE(InfoExtractor): video_id, transform_source=js_to_json, fatal=False) video_data = extract_from_jsmods_instances(server_js_data) + tahoe_data = FacebookTahoeData(self, webpage, video_id) if not video_data: if not fatal_if_no_video: return webpage, False @@ -349,36 +369,33 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - # Video info not in first request, do a secondary request using # tahoe player specific URL - tahoe_data = self._download_webpage( - self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id, - data=urlencode_postdata({ - '__a': 1, - '__pc': self._search_regex( - r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage, - 'pkg cohort', default='PHASED:DEFAULT'), - '__rev': self._search_regex( - r'client_revision["\']\s*:\s*(\d+),', webpage, - 'client revision', default='3944515'), - 'fb_dtsg': self._search_regex( - r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', - webpage, 'dtsg token', default=''), - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) tahoe_js_data = self._parse_json( self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data, + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data.primary, 'tahoe js data', default='{}'), video_id, fatal=False) + video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data: raise ExtractorError('Cannot parse data') + is_scheduled = '"isScheduledLive":true' in tahoe_data.secondary + is_live_stream = video_data[0].get('is_live_stream', False) + is_broadcast = video_data[0].get('is_broadcast', False) + + live_status = 'not_live' + if is_broadcast: + live_status = 'completed' + if is_live_stream: + live_status = 'live' + if is_scheduled: + live_status = 'upcoming' + + is_live = live_status == 'live' + formats = [] for f in video_data: format_id = f['stream_type'] @@ -423,16 +440,35 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id( 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - default=None) or self._og_search_title(webpage, fatal=False) + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ + self._og_search_title(webpage, default=None) or self._search_regex( + r'\"ownerName\":"(.+?)"', tahoe_data.secondary, + 'uploader_id', fatal=False) + + timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None) or self._search_regex( + r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, 'timestamp', default=None)) + + uploader_id = self._search_regex( + r'ownerid:"([\d]+)', webpage, + 'uploader_id', default=None) or self._search_regex( + r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, + 'uploader_id', fatal=False) thumbnail = self._og_search_thumbnail(webpage) view_count = parse_count(self._search_regex( + r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None) or self._search_regex( + r'[\'\"]postViewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count', + default=None) or self._search_regex( r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) + default=None) or self._search_regex( + r'[\'\"]viewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count', + default=None) + ) info_dict = { 'id': video_id, @@ -442,6 +478,9 @@ class FacebookIE(InfoExtractor): 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, + 'uploader_id': uploader_id, + 'is_live': is_live, + 'live_status': live_status } return webpage, info_dict @@ -472,6 +511,54 @@ class FacebookIE(InfoExtractor): return info_dict +class FacebookTahoeData: + def __init__(self, extractor, page, video_id): + self._page = page + self._video_id = video_id + self._extractor = extractor + self._data = {} + + def _get_data(self, data_type): + if data_type in self._data: + data = self._data[data_type] + else: + req_data, headers = self._get_request_data_and_headers() + data = self._extractor._download_webpage( + self._extractor._VIDEO_PAGE_TAHOE_TEMPLATE % (self._video_id, data_type), self._video_id, + data=req_data, + headers=headers + ) + return '' if not data else data + + @property + def primary(self): + return self._get_data('primary') + + @property + def secondary(self): + return self._get_data('secondary') + + def _get_request_data_and_headers(self): + tahoe_request_data = urlencode_postdata( + { + '__a': 1, + '__pc': self._extractor._search_regex( + r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', self._page, + 'pkg cohort', default='PHASED:DEFAULT'), + '__rev': self._extractor._search_regex( + r'client_revision["\']\s*:\s*(\d+),', self._page, + 'client revision', default='3944515'), + 'fb_dtsg': self._extractor._search_regex( + r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"', + self._page, 'dtsg token', default=''), + }) + tahoe_request_headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + } + + return tahoe_request_data, tahoe_request_headers + + class FacebookPluginsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 114b93c07..1eb3bdd2f 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -94,6 +94,21 @@ class OdnoklassnikiIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Video has not been found', + }, { + # live video + 'url': 'https://www.ok.ru/video/1050794925929', + 'info_dict': { + 'id': '1050794925929', + 'title': 're:^Поиск репертуара [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'ext': 'mp4', + 'upload_date': u'20190428', + 'uploader': u'(((((КнЯзЬ ))))', + 'uploader_id': u'557343776873', + 'is_live': True + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -131,8 +146,8 @@ class OdnoklassnikiIE(InfoExtractor): 'http://ok.ru/video/%s' % video_id, video_id) error = self._search_regex( - r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', - webpage, 'error', default=None) + r'
(?P.*?)<\/div>', + webpage, name='error', group='error', default=None) if error: raise ExtractorError(error, expected=True) @@ -176,6 +191,45 @@ class OdnoklassnikiIE(InfoExtractor): upload_date = unified_strdate(self._html_search_meta( 'ya:ovs:upload_date', webpage, 'upload date', default=None)) + if upload_date is None: + upload_date_str = self._search_regex( + r'vp-layer-info_date">(?P.*?)<\/span>', + webpage, 'upload date', group='date') + if upload_date_str: + upload_date_str = upload_date_str.replace('Sept', 'Sep') + from datetime import datetime, timedelta + upload_date_time = None + try: + upload_date_time = datetime.strptime(upload_date_str, '%d %b %Y') + except: + pass + try: + upload_date_time = datetime.strptime(upload_date_str, '%d %b') + upload_date_time = upload_date_time.replace(year=datetime.utcnow().year) + except: + pass + try: + upload_date_time = datetime.strptime(upload_date_str, '%d %B') + upload_date_time = upload_date_time.replace(year=datetime.utcnow().year) + except: + pass + + try: + if upload_date_str.find(':') >=0: + hour_and_minutes = upload_date_str.split(' ')[-1] + else: + hour_and_minutes = upload_date_str + upload_date_time = datetime.strptime(hour_and_minutes, '%H:%M') + upload_date_time = upload_date_time.replace(year=datetime.utcnow().year) + upload_date_time = upload_date_time.replace(day=datetime.utcnow().day) + if upload_date_str.find('yesterday') ==0: + upload_date_time = upload_date_time - timedelta(days=1) + except: + pass + + if upload_date_time: + upload_date = upload_date_time.strftime('%Y%m%d') + age_limit = None adult = self._html_search_meta( 'ya:ovs:adult', webpage, 'age limit', default=None) @@ -207,6 +261,7 @@ class OdnoklassnikiIE(InfoExtractor): assert title if provider == 'LIVE_TV_APP': info['title'] = self._live_title(title) + info['is_live'] = True quality = qualities(('4', '0', '1', '2', '3', '5')) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a41178bab..1a571eb7e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -397,6 +397,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'timestamp': 1324343742, 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + 'view_count': int, }, }, { @@ -693,12 +694,17 @@ class VimeoIE(VimeoBaseInfoExtractor): 'timestamp', default=None) try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) + # When userInteractionCount does not exist views is 0 + view_count = int_or_none( + self._search_regex( + r'"interactionType":"http:\/\/schema\.org\/WatchAction","userInteractionCount":(.+?)}', + webpage, 'view count', default=0 + ) + ) like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) except RegexNotFoundError: # This info is only available in vimeo.com/{id} urls - view_count = None like_count = None comment_count = None diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f57ed2288..aa241824a 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -282,7 +282,13 @@ class VKIE(VKBaseIE): # The video is not available in your region. 'url': 'https://vk.com/video-51812607_171445436', 'only_matching': True, - }] + }, + { + # Video %s is not available. + 'url': 'https://vk.com/video-173478245_456239188', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -345,6 +351,9 @@ class VKIE(VKBaseIE): r'The video .+? is not available in your region.': 'Video %s is not available in your region.', + + r'The video .+? is unavailable': + 'Video %s is not available.', } for error_re, error_msg in ERRORS.items(): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6aa666bc9..c0c82859e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.06.08' +__version__ = 'vc.2019.06.08'