fix facebook thumbnail

2020-06-28 15:59:47 +03:00 · 2020-06-28 15:59:47 +03:00 · a1986b5bf2
commit a1986b5bf2
parent b84d66e626
2 changed files with 56 additions and 10 deletions
--- a/test/ci/test_facebook.py
+++ b/test/ci/test_facebook.py
@ -1,5 +1,6 @@
 import unittest
 import youtube_dl
 from youtube_dl.utils import DownloadError
 class facebookMetaData(unittest.TestCase):
@ -25,8 +26,15 @@ class facebookMetaData(unittest.TestCase):
        info = ydl.extract_info(url, download=False)
        self.assertGreater(info.get('comment_count'), 0)
    def test_meta_data(self):
        """Extract metadata for a watch URL without downloading the video
        and check that a positive comment count is reported."""
        watch_url = "https://www.facebook.com/watch?v=177407933624543/"
        options = {}
        extractor = youtube_dl.YoutubeDL(options)
        metadata = extractor.extract_info(watch_url, download=False)
        comment_total = metadata.get('comment_count')
        self.assertGreater(comment_total, 0)
    def test_metadata_fetch_with_log_in(self):
-        url = "https://www.facebook.com/SerieA/videos/282581803097269"
+        url = "https://www.facebook.com/oristandup/videos/675360549895283"
        params = {}
        with open("cookie_file") as file:
            proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128"
@ -34,6 +42,8 @@ class facebookMetaData(unittest.TestCase):
            params['proxy'] = proxy
            ydl = youtube_dl.YoutubeDL(params)
            info = ydl.extract_info(url, download=False)
            print (info.get('title'))
            print (info.get('timestamp'))
            self.assertTrue(info.get('timestamp'))
            self.assertTrue(info.get('view_count'))
            self.assertTrue(info.get('comment_count'))
@ -41,6 +51,19 @@ class facebookMetaData(unittest.TestCase):
            self.assertTrue(info.get('uploader_id'))
            self.assertTrue(info.get('thumbnail'))
    def test_unavailable_video(self):
        """Extracting an unavailable video must raise ``DownloadError``.

        Reads cookies from ``cookie_file`` in the working directory and sends
        the request through a fixed proxy.
        """
        url = "https://www.facebook.com/101457238278830/videos/287839102599521/"
        proxy = "ec2-3-221-82-67.compute-1.amazonaws.com:3128"
        with open("cookie_file") as file:
            params = {'cookiefile': file.name, 'proxy': proxy}
            ydl = youtube_dl.YoutubeDL(params)
            # assertRaises has to wrap the call: invoked without a callable
            # it only returns a context manager and asserts nothing, which
            # would let this test pass even when no error is raised.
            with self.assertRaises(DownloadError):
                ydl.extract_info(url, download=False)
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@ -374,13 +374,7 @@ class FacebookIE(InfoExtractor):
        if not video_data:
            if not fatal_if_no_video:
                return webpage, False
-            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
+            self.validate_webpage(webpage)
            if m_msg is not None:
                raise ExtractorError(
                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                    expected=True)
            elif '>You must log in to continue' in webpage:
                self.raise_login_required()
        if not video_data:
            info_dict = self.get_from_new_ui(webpage, tahoe_data, video_id)
@ -806,13 +800,42 @@ class FacebookIE(InfoExtractor):
    def _resolve_thumbnail(self, webpage, tahoe_data):
        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
        if not thumbnail:
-            thumbnail = self._search_regex(r'"thumbSrc":"(.+?)"', tahoe_data.secondary, 'thumbnail', fatal=False)
+            page = self.resolve_full_webpage(tahoe_data)
            thumbnail = self._search_regex(r'"thumbnailUrl":"(.+?)"', page, 'thumbnail', fatal=False)
            thumbnail = str(thumbnail).replace('\\', "")
        return thumbnail
    def _valid_video_title(self, video_title):
-        return video_title and not u'Log In or Sign Up to View' in video_title
+        if video_title:
            video_title = video_title.lower()
        return video_title and not u'log in or sign up to view' in video_title
    def validate_webpage(self, webpage):
        """Raise when ``webpage`` is an error page or a login wall.

        Checks, in order: a ``uiInterstitialContent`` block (its inner text is
        quoted in the error message), the "Your Request Couldn't be Processed"
        banner, and the "You must log in to continue" prompt, which is
        delegated to ``self.raise_login_required()``. Returns None when none
        of the markers is present.
        """
        interstitial = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
        if interstitial is not None:
            reason = interstitial.group(1)
            raise ExtractorError(
                'The video is not available, Facebook said: "%s"' % reason,
                expected=True)

        request_failed = 'Your Request Couldn\'t be Processed' in webpage
        if request_failed:
            raise ExtractorError(
                'The video is not available, Facebook said: this content is not available',
                expected=True)

        if '>You must log in to continue' in webpage:
            self.raise_login_required()
    def resolve_full_webpage(self, tahoe_data):
        """Download the video's permalink page and return its content.

        The permalink is taken from the ``"permalinkURL"`` field found in
        ``tahoe_data.primary`` and fetched with a desktop-browser User-Agent.

        Returns the page body as a ``str``, or an empty string when no
        permalink can be found. Errors raised by ``urlopen`` propagate to the
        caller.
        """
        # Python 2 ships urllib2; on Python 3 the same API lives in
        # urllib.request.
        try:
            import urllib2
        except ImportError:
            import urllib.request as urllib2
        from contextlib import closing

        # NOTE(review): a direct urllib request presumably bypasses the
        # downloader's configured proxy and cookies - consider using the
        # extractor's own page-download helper instead; confirm.
        user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
        headers = {'User-Agent': user_agent}
        full_url = self._search_regex(r'"permalinkURL":"(.+?)"', tahoe_data.primary, 'video_url', fatal=False)
        if not full_url:
            # Without this guard an unsuccessful lookup would be stringified
            # and requested as a URL, failing with an unrelated error.
            return ''
        # Strip the backslashes the embedded URL carries (presumably JSON
        # escaping of the slashes).
        full_url = str(full_url).replace('\\', "")
        req = urllib2.Request(full_url, None, headers)
        # closing() releases the connection even if read() raises.
        with closing(urllib2.urlopen(req)) as response:
            page = response.read()
        # On Python 3 read() yields bytes; decode so callers can apply str
        # regexes. On Python 2 the body already is a str and is left as is.
        if not isinstance(page, str):
            page = page.decode('utf-8', 'replace')
        return page
 class FacebookTahoeData: