From a18d8b0a25b5a292fd874574b6e56c970fd889a2 Mon Sep 17 00:00:00 2001 From: hodayabu <44240078+hodayabu@users.noreply.github.com> Date: Sun, 31 May 2020 11:42:38 +0300 Subject: [PATCH] Facebook new ui (#323) * fix_view_count_facebook * facebook_new_ui_metadata * facebook_new_ui_live_info_fix Co-authored-by: bhodaya --- youtube_dl/extractor/facebook.py | 131 ++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b11162a07..8f7a2b5c4 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,7 +1,5 @@ # coding: utf-8 from __future__ import unicode_literals - -import datetime import re import socket @@ -11,7 +9,7 @@ from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, + compat_urllib_parse_unquote_plus ) from ..utils import ( clean_html, @@ -27,11 +25,10 @@ from ..utils import ( urlencode_postdata, update_url_query, lowercase_escape, - parse_iso8601 + parse_iso8601, + unescapeHTML, ) - - class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: @@ -385,7 +382,12 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() - if not video_data : + if not video_data: + info_dict = self.get_from_new_ui(webpage, tahoe_data, url) + if info_dict: + return webpage, info_dict + + if not video_data: if self._search_regex(r'newsFeedStream.*?

(.*?)<\/span><\/h1>', webpage, "video_title") is not None: self.raise_login_required() raise ExtractorError('Cannot parse data') @@ -394,15 +396,8 @@ class FacebookIE(InfoExtractor): is_live_stream = video_data[0].get('is_live_stream', False) is_broadcast = video_data[0].get('is_broadcast', False) - live_status = 'not_live' - if is_broadcast: - live_status = 'completed' - if is_live_stream: - live_status = 'live' - if is_scheduled: - live_status = 'upcoming' + is_live, live_status = self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) - is_live = live_status == 'live' subtitles = {} formats = [] @@ -478,8 +473,6 @@ class FacebookIE(InfoExtractor): timestamp = int_or_none(regex_search_result_date_time) or int_or_none(regex_search_result_publish_time) - - uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( @@ -503,6 +496,50 @@ class FacebookIE(InfoExtractor): comment_count = parse_count(self._extract_comments_count(webpage, tahoe_data)) uploader_handle = self._resolve_uploader_handle(tahoe_data, uploader_id) + + info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, + thumbnail, view_count, uploader_id, is_live, live_status, likes_count, + shares_count, subtitles, comment_count, other_posts_view_count, uploader_handle) + + return webpage, info_dict + + + + def get_from_new_ui(self, webpage, tahoe_data, url): + + video_title = self._search_regex(r'"headline":"(.+?")', webpage, 'title') + comments_count = parse_count(self._search_regex(r'"commentCount":(.+?,)', webpage, 'comments_count')) + subtitles = self._search_regex(r'"about":"(.+?")', webpage, 'subtitles') + likes = parse_count(self._extract_likes(webpage, tahoe_data)) + + timestamp = self._search_regex(r'"datePublished":"(.+?)"', webpage, 'timestamp') + timestamp = parse_iso8601(timestamp) + + uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader') + uploader_handle, uploader = self._extract_uploader_info_new_ui(uploader_json) + + ids_json = self._search_regex(r'data-video-channel-id="(.+?)"', webpage, 'ids') + channel_id, video_id = self._extract_ids_info_new_ui(ids_json) + + post_view_counts = parse_count(self._search_regex(r'"postViewCount":(.+?),', tahoe_data.secondary, 'views')) + other_post_view_counts = parse_count(self._search_regex(r'"otherPostsViewCount":(.+?),', tahoe_data.secondary, 'other_views')) + + share_counts = parse_count(self._search_regex(r'"sharecount":(.+?),', tahoe_data.secondary, 'other_views')) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail') + is_live, live_status = self.resolve_new_ui_live_info(webpage, tahoe_data) + + formats = self.resolve_new_ui_format(webpage) + info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, + thumbnail, post_view_counts, channel_id, is_live, live_status, likes, + share_counts, {}, comments_count, other_post_view_counts, + uploader_handle) + + return info_dict + + def build_info_dict(self,webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None, + timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None, live_status=None, + likes_count=None, shares_count=None, subtitles=None, comment_count=None, other_posts_view_count=None, + uploader_handle=None): info_dict = { 'id': video_id, 'title': video_title, @@ -528,7 +565,7 @@ class FacebookIE(InfoExtractor): if uploader_id: info_dict['uploader_like_count'] = FacebookAjax(self, webpage, uploader_id).page_likes - return webpage, info_dict + return info_dict def _resolve_uploader_handle(self, tahoe_data, uploader_id): uploader_handle = self._search_regex(r'"video_path":"\\\/([^\/]+)\\\/', tahoe_data.primary, 'uploader_handle', @@ -655,6 +692,64 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id return video_title + def _extract_uploader_info_new_ui(self, uploader_json): + uploader_handle = self._search_regex(r'"name":"(.+?")', uploader_json, 'uploader') + uploader_url = self._search_regex(r'"url":"(.+?")', uploader_json, 'uploader_url') + uploader_url_str = uploader_url.decode("utf-8") + uploader = uploader_url_str.split('\\/')[-2] + return uploader_handle, uploader + + def _extract_ids_info_new_ui(self, ids_json): + ids_json_str = ids_json.decode("utf-8") + ids = ids_json_str.split(':') + channel_id = ids[0] + video_id = ids[1] + return channel_id, video_id + + def resolve_new_ui_live_info(self, webpage, tahoe_data): + + is_scheduled = '"isScheduledLive":true' in tahoe_data.secondary + is_live_stream = self._search_regex(r'"isLiveVOD":(.+?),', tahoe_data.secondary, "vod_live") + is_broadcast = '"isLiveBroadcast":true' in webpage + + return self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) + + + def extract_live_info(self, is_scheduled, is_live_stream, is_broadcast): + live_status = 'not_live' + if is_broadcast: + live_status = 'completed' + if is_live_stream: + live_status = 'live' + if is_scheduled: + live_status = 'upcoming' + + is_live = live_status == 'live' + + return is_live, live_status + + + def resolve_new_ui_format(self, webpage): + format_url = self.build_format_url(webpage) + width = parse_count(self._search_regex(r'