Merge pull request #339 from aviperes/facebook_missing_reactions_and_comment_count

facebook separate reactions from likes
This commit is contained in:
aviperetz34 2020-06-21 10:51:20 +03:00 committed by GitHub
commit 52c3b5898e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 55 additions and 17 deletions

31
test/ci/test_facebook.py Normal file
View File

@ -0,0 +1,31 @@
import unittest
import youtube_dl
class facebookMetaData(unittest.TestCase):
    """Live checks of the metadata youtube_dl extracts for a Facebook video.

    These tests hit the network: they call ``YoutubeDL.extract_info`` with
    ``download=False`` against a public Facebook video URL.
    """

    # Public video used by every test in this class.
    _VIDEO_URL = "https://www.facebook.com/iihfhockey/videos/2742345396033296/"

    def test_metadata_fetch(self):
        """The info dict exposes likes and reactions as separate keys."""
        ydl = youtube_dl.YoutubeDL({})
        info = ydl.extract_info(self._VIDEO_URL, download=False)
        # dict.has_key() was removed in Python 3; assertIn checks key
        # presence on both Python 2.7 and 3 and reports the missing key
        # on failure.
        self.assertIn('like_count', info)
        self.assertIn('reactions_count', info)

    def _test_metadata_fetch_with_log_in(self):
        """Check metadata fields when a cookie file and a proxy are supplied.

        The leading underscore keeps unittest discovery from running this
        method; it needs a local ``cookie_file`` and the proxy below.
        """
        params = {}
        # Opening the file makes the test fail immediately if it is missing;
        # only the file name is handed on to YoutubeDL.
        with open("cookie_file") as cookie_file:
            params['cookiefile'] = cookie_file.name
        params['proxy'] = "ec2-35-175-164-238.compute-1.amazonaws.com:3128"

        ydl = youtube_dl.YoutubeDL(params)
        info = ydl.extract_info(self._VIDEO_URL, download=False)
        # Each of these fields must be present and truthy.
        for field in ('timestamp', 'view_count', 'width', 'uploader_id', 'thumbnail'):
            self.assertTrue(info.get(field))
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -398,7 +398,6 @@ class FacebookIE(InfoExtractor):
is_live, live_status = self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) is_live, live_status = self.extract_live_info(is_scheduled, is_live_stream, is_broadcast)
subtitles = {} subtitles = {}
formats = [] formats = []
for f in video_data: for f in video_data:
@ -443,8 +442,9 @@ class FacebookIE(InfoExtractor):
return lowercase_escape(s) return lowercase_escape(s)
uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or \ uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or \
self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \ self._search_regex(r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \
_lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id', fatal=False)) or \ _lowercase_escape(self._search_regex(r'\"ownerName\":"(.+?)"', tahoe_data.secondary, 'uploader_id',
fatal=False)) or \
self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \
self._og_search_title(webpage, default=None) self._og_search_title(webpage, default=None)
@ -476,6 +476,7 @@ class FacebookIE(InfoExtractor):
view_count = parse_count(self._extract_views(webpage, tahoe_data)) view_count = parse_count(self._extract_views(webpage, tahoe_data))
other_posts_view_count = parse_count(self._extract_meta_count(['otherPostsViewCount'], webpage, tahoe_data, 'other_post_views')) other_posts_view_count = parse_count(self._extract_meta_count(['otherPostsViewCount'], webpage, tahoe_data, 'other_post_views'))
reactions_count = parse_count(self._extract_reactions(webpage, tahoe_data))
likes_count = parse_count(self._extract_likes(webpage, tahoe_data)) likes_count = parse_count(self._extract_likes(webpage, tahoe_data))
shares_count = parse_count(self._extract_shares(webpage, tahoe_data)) shares_count = parse_count(self._extract_shares(webpage, tahoe_data))
comment_count = parse_count(self._extract_comments_count(webpage, tahoe_data)) comment_count = parse_count(self._extract_comments_count(webpage, tahoe_data))
@ -484,12 +485,11 @@ class FacebookIE(InfoExtractor):
info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp,
thumbnail, view_count, uploader_id, is_live, live_status, likes_count, thumbnail, view_count, uploader_id, is_live, live_status, likes_count,
shares_count, subtitles, comment_count, other_posts_view_count, uploader_handle) reactions_count, shares_count, subtitles, comment_count, other_posts_view_count,
uploader_handle)
return webpage, info_dict return webpage, info_dict
def get_from_new_ui(self, webpage, tahoe_data, video_id): def get_from_new_ui(self, webpage, tahoe_data, video_id):
video_title = self._resolve_new_ui_title(webpage, tahoe_data, video_id) video_title = self._resolve_new_ui_title(webpage, tahoe_data, video_id)
@ -498,6 +498,8 @@ class FacebookIE(InfoExtractor):
likes = parse_count(self._extract_likes(webpage, tahoe_data)) likes = parse_count(self._extract_likes(webpage, tahoe_data))
reactions = parse_count(self._extract_reactions(webpage, tahoe_data))
timestamp = self._resolve_new_ui_timestamp(webpage, tahoe_data) timestamp = self._resolve_new_ui_timestamp(webpage, tahoe_data)
uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader') uploader_json = self._search_regex(r'"author":{(.+?)}', webpage, 'uploader')
@ -517,15 +519,17 @@ class FacebookIE(InfoExtractor):
formats = self.resolve_new_ui_format(webpage) formats = self.resolve_new_ui_format(webpage)
info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp, info_dict = self.build_info_dict(webpage, tahoe_data, video_id, video_title, formats, uploader, timestamp,
thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, thumbnail, post_view_counts, uploader_id, is_live, live_status, likes, reactions,
share_counts, {}, comments_count, other_post_view_counts, share_counts, {}, comments_count, other_post_view_counts,
uploader_handle) uploader_handle)
return info_dict return info_dict
def build_info_dict(self,webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None, def build_info_dict(self, webpage, tahoe_data, video_id, video_title=None, formats=None, uploader=None,
timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None, live_status=None, timestamp=None, thumbnail=None, view_count=None, uploader_id=None, is_live=None,
likes_count=None, shares_count=None, subtitles=None, comment_count=None, other_posts_view_count=None, live_status=None,
likes_count=None, reactions_count=None, shares_count=None, subtitles=None, comment_count=None,
other_posts_view_count=None,
uploader_handle=None): uploader_handle=None):
info_dict = { info_dict = {
'id': video_id, 'id': video_id,
@ -539,6 +543,7 @@ class FacebookIE(InfoExtractor):
'is_live': is_live, 'is_live': is_live,
'live_status': live_status, 'live_status': live_status,
'like_count': likes_count, 'like_count': likes_count,
'reactions_count': reactions_count,
'share_count': shares_count, 'share_count': shares_count,
'subtitles': subtitles, 'subtitles': subtitles,
'comment_count': comment_count, 'comment_count': comment_count,
@ -571,8 +576,8 @@ class FacebookIE(InfoExtractor):
if value: if value:
break break
value = self._search_regex( value = self._search_regex(
r'\b%s\s*:\s*["\']([\d,.]+)' % f, webpage, name, r'\b%s\s*:\s*["\']([\d,.]+)' % f, webpage, name,
default=None default=None
) )
if value: if value:
break break
@ -593,10 +598,15 @@ class FacebookIE(InfoExtractor):
if values: if values:
return values[-1] return values[-1]
def _extract_likes(self, webpage, tahoe_data): def _extract_reactions(self, webpage, tahoe_data):
pairs = ( pairs = (
(r'"reaction_count"\s*:\s*{\s*"count"\s*:\s*(\d+)', [tahoe_data.secondary, webpage]), (r'"reaction_count"\s*:\s*{\s*"count"\s*:\s*(\d+)', [tahoe_data.secondary, webpage]),
(r'reaction_count:{count:([\d]+)}', webpage), (r'reaction_count:{count:([\d]+)}', webpage)
)
return self._extract_first_pattern(pairs)
def _extract_likes(self, webpage, tahoe_data):
pairs = (
(r'\blikecount\s*:\s*["\']([\d,.]+)', webpage), (r'\blikecount\s*:\s*["\']([\d,.]+)', webpage),
(r'[\'\"]\blikecount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary) (r'[\'\"]\blikecount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary)
) )
@ -709,7 +719,6 @@ class FacebookIE(InfoExtractor):
return self.extract_live_info(is_scheduled, is_live_stream, is_broadcast) return self.extract_live_info(is_scheduled, is_live_stream, is_broadcast)
def extract_live_info(self, is_scheduled, is_live_stream, is_broadcast): def extract_live_info(self, is_scheduled, is_live_stream, is_broadcast):
live_status = 'not_live' live_status = 'not_live'
if is_broadcast: if is_broadcast:
@ -723,7 +732,6 @@ class FacebookIE(InfoExtractor):
return is_live, live_status return is_live, live_status
def resolve_new_ui_format(self, webpage): def resolve_new_ui_format(self, webpage):
format_url = self.build_format_url(webpage) format_url = self.build_format_url(webpage)
width = parse_count(self._search_regex(r'<meta property="og:video:width" content="(.+?)"', webpage, 'width')) width = parse_count(self._search_regex(r'<meta property="og:video:width" content="(.+?)"', webpage, 'width'))
@ -799,7 +807,6 @@ class FacebookIE(InfoExtractor):
return video_title and not u'Log In or Sign Up to View' in video_title return video_title and not u'Log In or Sign Up to View' in video_title
class FacebookTahoeData: class FacebookTahoeData:
def __init__(self, extractor, page, video_id): def __init__(self, extractor, page, video_id):
self._page = page self._page = page