From a1986b5bf25a0c17bee78897b6945bcd4f983077 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 28 Jun 2020 15:59:47 +0300 Subject: [PATCH 1/2] fix facebook thumbnail --- test/ci/test_facebook.py | 25 ++++++++++++++++++- youtube_dl/extractor/facebook.py | 41 +++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 8533a81f5..383017c09 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -1,5 +1,6 @@ import unittest import youtube_dl +from youtube_dl.utils import DownloadError class facebookMetaData(unittest.TestCase): @@ -25,8 +26,15 @@ class facebookMetaData(unittest.TestCase): info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) + def test_meta_data(self): + params = {} + url = "https://www.facebook.com/watch?v=177407933624543/" + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertGreater(info.get('comment_count'), 0) + def test_metadata_fetch_with_log_in(self): - url = "https://www.facebook.com/SerieA/videos/282581803097269" + url = "https://www.facebook.com/oristandup/videos/675360549895283" params = {} with open("cookie_file") as file: proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128" @@ -34,6 +42,8 @@ class facebookMetaData(unittest.TestCase): params['proxy'] = proxy ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) + print (info.get('title')) + print (info.get('timestamp')) self.assertTrue(info.get('timestamp')) self.assertTrue(info.get('view_count')) self.assertTrue(info.get('comment_count')) @@ -41,6 +51,19 @@ class facebookMetaData(unittest.TestCase): self.assertTrue(info.get('uploader_id')) self.assertTrue(info.get('thumbnail')) + def test_unavailable_video(self): + url = "https://www.facebook.com/101457238278830/videos/287839102599521/" + params = {} + with open("cookie_file") as file: + try: + proxy = 
"ec2-3-221-82-67.compute-1.amazonaws.com:3128" + params['cookiefile'] = file.name + params['proxy'] = proxy + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + except DownloadError: + self.assertRaises(DownloadError) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cf60a2c21..f18eb872b 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -374,13 +374,7 @@ class FacebookIE(InfoExtractor): if not video_data: if not fatal_if_no_video: return webpage, False - m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
<div>(.*?)</div>
', webpage) - if m_msg is not None: - raise ExtractorError( - 'The video is not available, Facebook said: "%s"' % m_msg.group(1), - expected=True) - elif '>You must log in to continue' in webpage: - self.raise_login_required() + self.validate_webpage(webpage) if not video_data: info_dict = self.get_from_new_ui(webpage, tahoe_data, video_id) @@ -806,13 +800,42 @@ class FacebookIE(InfoExtractor): def _resolve_thumbnail(self, webpage, tahoe_data): thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) + if not thumbnail: - thumbnail = self._search_regex(r'"thumbSrc":"(.+?)"', tahoe_data.secondary, 'thumbnail', fatal=False) + page = self.resolve_full_webpage(tahoe_data) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', page, 'thumbnail', fatal=False) thumbnail = str(thumbnail).replace('\\', "") return thumbnail def _valid_video_title(self, video_title): - return video_title and not u'Log In or Sign Up to View' in video_title + if video_title: + video_title = video_title.lower() + return video_title and not u'log in or sign up to view' in video_title + + def validate_webpage(self, webpage): + m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
<div>(.*?)</div>
', webpage) + if m_msg is not None: + raise ExtractorError( + 'The video is not available, Facebook said: "%s"' % m_msg.group(1), + expected=True) + if 'Your Request Couldn\'t be Processed' in webpage: + raise ExtractorError( + 'The video is not available, Facebook said: this content is not available', + expected=True) + elif '>You must log in to continue' in webpage: + self.raise_login_required() + + def resolve_full_webpage(self, tahoe_data): + import urllib2 + user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3' + headers = {'User-Agent': user_agent} + full_url = self._search_regex(r'"permalinkURL":"(.+?)"', tahoe_data.primary, 'video_url', fatal=False) + full_url = str(full_url).replace('\\', "") + req = urllib2.Request(full_url, None, headers) + response = urllib2.urlopen(req) + page = response.read() + response.close() + return page class FacebookTahoeData: From 0f45a3da23690f9138f7b2c3083b72951c40e392 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Mon, 29 Jun 2020 09:28:58 +0300 Subject: [PATCH 2/2] fix facebook thumbnail --- test/ci/test_facebook.py | 2 +- youtube_dl/extractor/facebook.py | 15 +-------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/test/ci/test_facebook.py b/test/ci/test_facebook.py index 383017c09..8910b3261 100644 --- a/test/ci/test_facebook.py +++ b/test/ci/test_facebook.py @@ -28,7 +28,7 @@ class facebookMetaData(unittest.TestCase): def test_meta_data(self): params = {} - url = "https://www.facebook.com/watch?v=177407933624543/" + url = "https://www.facebook.com/parapsychological.centr/videos/177407933624543/" ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertGreater(info.get('comment_count'), 0) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f18eb872b..8db50e89f 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ 
-802,8 +802,7 @@ class FacebookIE(InfoExtractor): thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if not thumbnail: - page = self.resolve_full_webpage(tahoe_data) - thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', page, 'thumbnail', fatal=False) + thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', webpage, 'thumbnail', fatal=False) thumbnail = str(thumbnail).replace('\\', "") return thumbnail @@ -825,18 +824,6 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() - def resolve_full_webpage(self, tahoe_data): - import urllib2 - user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3' - headers = {'User-Agent': user_agent} - full_url = self._search_regex(r'"permalinkURL":"(.+?)"', tahoe_data.primary, 'video_url', fatal=False) - full_url = str(full_url).replace('\\', "") - req = urllib2.Request(full_url, None, headers) - response = urllib2.urlopen(req) - page = response.read() - response.close() - return page - class FacebookTahoeData: def __init__(self, extractor, page, video_id):