Merge branch 'fix.25.12.2018'

2019-06-13 11:43:30 +03:00 · 2019-06-13 11:43:30 +03:00 · 0317d16c78
commit 0317d16c78
parent f21e20734e d6ad71cb7a
5 changed files with 186 additions and 29 deletions
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@ -57,7 +57,7 @@ class FacebookIE(InfoExtractor):
    _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'

    _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
-    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
+    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=%s'

    _TESTS = [{
        'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
@ -218,6 +218,25 @@ class FacebookIE(InfoExtractor):
            'ext': 'mp4',
            'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
            'uploader': 'ESL One Dota 2',
+            'timestamp': 1527084179,
+            'upload_date': '20180523',
+            'uploader_id': '234218833769558',
+            'is_live': False
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # no timestamp
+        'url': 'https://www.facebook.com/SuperNewsGames/videos/642255722780473/',
+        'info_dict': {
+            'timestamp': 1521221400,
+            'uploader': 'Super News Games',
+            'uploader_id': '229550157384367',
+            'id': '642255722780473',
+            'ext': 'mp4',
+            'upload_date': '20180316',
+            'title': 'The Voice of Nick is trying Fortnite after 100 hours of PLAYERUNKNOWN\'S BATTL...',
        },
        'params': {
            'skip_download': True,
@ -339,6 +358,7 @@ class FacebookIE(InfoExtractor):
                video_id, transform_source=js_to_json, fatal=False)
            video_data = extract_from_jsmods_instances(server_js_data)

+        tahoe_data = FacebookTahoeData(self, webpage, video_id)
        if not video_data:
            if not fatal_if_no_video:
                return webpage, False
@ -349,36 +369,33 @@ class FacebookIE(InfoExtractor):
                    expected=True)
            elif '>You must log in to continue' in webpage:
                self.raise_login_required()
-
            # Video info not in first request, do a secondary request using
            # tahoe player specific URL
-            tahoe_data = self._download_webpage(
-                self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
-                data=urlencode_postdata({
-                    '__a': 1,
-                    '__pc': self._search_regex(
-                        r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
-                        'pkg cohort', default='PHASED:DEFAULT'),
-                    '__rev': self._search_regex(
-                        r'client_revision["\']\s*:\s*(\d+),', webpage,
-                        'client revision', default='3944515'),
-                    'fb_dtsg': self._search_regex(
-                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
-                        webpage, 'dtsg token', default=''),
-                }),
-                headers={
-                    'Content-Type': 'application/x-www-form-urlencoded',
-                })
            tahoe_js_data = self._parse_json(
                self._search_regex(
-                    r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
+                    r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data.primary,
                    'tahoe js data', default='{}'),
                video_id, fatal=False)
+
            video_data = extract_from_jsmods_instances(tahoe_js_data)

        if not video_data:
            raise ExtractorError('Cannot parse data')

+        is_scheduled = '"isScheduledLive":true' in tahoe_data.secondary
+        is_live_stream = video_data[0].get('is_live_stream', False)
+        is_broadcast = video_data[0].get('is_broadcast', False)
+
+        live_status = 'not_live'
+        if is_broadcast:
+            live_status = 'completed'
+            if is_live_stream:
+                live_status = 'live'
+                if is_scheduled:
+                    live_status = 'upcoming'
+
+        is_live = live_status == 'live'
+
        formats = []
        for f in video_data:
            format_id = f['stream_type']
@ -423,16 +440,35 @@ class FacebookIE(InfoExtractor):
            video_title = 'Facebook video #%s' % video_id
        uploader = clean_html(get_element_by_id(
            'fbPhotoPageAuthorName', webpage)) or self._search_regex(
-            r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
-            default=None) or self._og_search_title(webpage, fatal=False)
+            r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',default=None) or \
+                   self._og_search_title(webpage, default=None) or self._search_regex(
+                        r'\"ownerName\":"(.+?)"', tahoe_data.secondary,
+                        'uploader_id', fatal=False)
+
+
        timestamp = int_or_none(self._search_regex(
            r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+            'timestamp', default=None) or self._search_regex(
+            r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary,
            'timestamp', default=None))
+
+        uploader_id = self._search_regex(
+            r'ownerid:"([\d]+)', webpage,
+            'uploader_id', default=None) or self._search_regex(
+            r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary,
+            'uploader_id', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage)

        view_count = parse_count(self._search_regex(
+            r'\bpostViewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
+            default=None) or self._search_regex(
+            r'[\'\"]postViewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count',
+            default=None) or self._search_regex(
            r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
-            default=None))
+            default=None) or self._search_regex(
+            r'[\'\"]viewCount[\'\"]\s*:\s*(\d+)', tahoe_data.secondary, 'view count',
+            default=None)
+        )

        info_dict = {
            'id': video_id,
@ -442,6 +478,9 @@ class FacebookIE(InfoExtractor):
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'view_count': view_count,
+            'uploader_id': uploader_id,
+            'is_live': is_live,
+            'live_status': live_status
        }

        return webpage, info_dict
@ -472,6 +511,54 @@ class FacebookIE(InfoExtractor):
            return info_dict


+class FacebookTahoeData(object):
+    """Lazily fetch and cache the "tahoe" async payloads of a Facebook video.
+
+    A payload is requested from the extractor's _VIDEO_PAGE_TAHOE_TEMPLATE
+    the first time it is read and is served from an in-memory cache
+    afterwards, so reading ``primary`` or ``secondary`` repeatedly costs at
+    most one successful HTTP request per payload type.
+    """
+
+    def __init__(self, extractor, page, video_id):
+        """Remember the request context; no network access happens here.
+
+        extractor -- object providing _download_webpage, _search_regex and
+                     _VIDEO_PAGE_TAHOE_TEMPLATE
+        page      -- already downloaded video webpage; it is searched for
+                     the tokens sent along with the tahoe request
+        video_id  -- id substituted into the tahoe URL template
+        """
+        self._page = page
+        self._video_id = video_id
+        self._extractor = extractor
+        # payload type ('primary' / 'secondary') -> downloaded text
+        self._data = {}
+
+    def _get_data(self, data_type):
+        """Return the payload text for data_type, downloading it only once.
+
+        A falsy download result is normalised to '' so callers can always
+        run substring and regex searches on the returned value.
+        """
+        if data_type not in self._data:
+            req_data, headers = self._get_request_data_and_headers()
+            data = self._extractor._download_webpage(
+                self._extractor._VIDEO_PAGE_TAHOE_TEMPLATE % (self._video_id, data_type),
+                self._video_id, data=req_data, headers=headers)
+            # Store the result; without this every property read would
+            # trigger a fresh request for the same payload.
+            self._data[data_type] = data or ''
+        return self._data[data_type]
+
+    @property
+    def primary(self):
+        """Text of the 'primary' payload (fetched on first access)."""
+        return self._get_data('primary')
+
+    @property
+    def secondary(self):
+        """Text of the 'secondary' payload (fetched on first access)."""
+        return self._get_data('secondary')
+
+    def _get_request_data_and_headers(self):
+        """Build the POST body and headers for a tahoe request.
+
+        The form fields are scraped from the stored webpage; each one falls
+        back to a fixed default when its pattern is not found.
+        Returns a (data, headers) tuple.
+        """
+        tahoe_request_data = urlencode_postdata(
+            {
+                '__a': 1,
+                '__pc': self._extractor._search_regex(
+                    r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', self._page,
+                    'pkg cohort', default='PHASED:DEFAULT'),
+                '__rev': self._extractor._search_regex(
+                    r'client_revision["\']\s*:\s*(\d+),', self._page,
+                    'client revision', default='3944515'),
+                'fb_dtsg': self._extractor._search_regex(
+                    r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                    self._page, 'dtsg token', default=''),
+            })
+        tahoe_request_headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+        }
+
+        return tahoe_request_data, tahoe_request_headers
+
+
 class FacebookPluginsVideoIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)'

--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@ -94,6 +94,21 @@ class OdnoklassnikiIE(InfoExtractor):
            'skip_download': True,
        },
        'skip': 'Video has not been found',
+    }, {
+        # live video
+        'url': 'https://www.ok.ru/video/1050794925929',
+        'info_dict': {
+            'id': '1050794925929',
+            'title': 're:^Поиск репертуара [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'ext': 'mp4',
+            'upload_date': u'20190428',
+            'uploader': u'(((((КнЯзЬ ))))',
+            'uploader_id': u'557343776873',
+            'is_live': True
+        },
+        'params': {
+            'skip_download': True,
+        }
    }, {
        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
        'only_matching': True,
@ -131,8 +146,8 @@ class OdnoklassnikiIE(InfoExtractor):
            'http://ok.ru/video/%s' % video_id, video_id)

        error = self._search_regex(
-            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
-            webpage, 'error', default=None)
+            r'<div class="vp_video_stub_txt">(?P<error>.*?)<\/div>',
+            webpage, name='error', group='error', default=None)
        if error:
            raise ExtractorError(error, expected=True)

@ -176,6 +191,45 @@ class OdnoklassnikiIE(InfoExtractor):
        upload_date = unified_strdate(self._html_search_meta(
            'ya:ovs:upload_date', webpage, 'upload date', default=None))

+        if upload_date is None:
+            upload_date_str = self._search_regex(
+                r'vp-layer-info_date">(?P<date>.*?)<\/span>',
+                webpage, 'upload date', group='date')
+            if upload_date_str:
+                upload_date_str = upload_date_str.replace('Sept', 'Sep')
+                from datetime import datetime, timedelta
+                upload_date_time = None
+                try:
+                    upload_date_time = datetime.strptime(upload_date_str, '%d %b %Y')
+                except:
+                    pass
+                try:
+                    upload_date_time = datetime.strptime(upload_date_str, '%d %b')
+                    upload_date_time = upload_date_time.replace(year=datetime.utcnow().year)
+                except:
+                    pass
+                try:
+                    upload_date_time = datetime.strptime(upload_date_str, '%d %B')
+                    upload_date_time = upload_date_time.replace(year=datetime.utcnow().year)
+                except:
+                    pass
+
+                try:
+                    if upload_date_str.find(':') >=0:
+                        hour_and_minutes = upload_date_str.split(' ')[-1]
+                    else:
+                        hour_and_minutes = upload_date_str
+                    upload_date_time = datetime.strptime(hour_and_minutes, '%H:%M')
+                    upload_date_time = upload_date_time.replace(year=datetime.utcnow().year)
+                    upload_date_time = upload_date_time.replace(day=datetime.utcnow().day)
+                    if upload_date_str.find('yesterday') ==0:
+                        upload_date_time = upload_date_time - timedelta(days=1)
+                except:
+                    pass
+
+                if upload_date_time:
+                    upload_date = upload_date_time.strftime('%Y%m%d')
+
        age_limit = None
        adult = self._html_search_meta(
            'ya:ovs:adult', webpage, 'age limit', default=None)
@ -207,6 +261,7 @@ class OdnoklassnikiIE(InfoExtractor):
        assert title
        if provider == 'LIVE_TV_APP':
            info['title'] = self._live_title(title)
+            info['is_live'] = True

        quality = qualities(('4', '0', '1', '2', '3', '5'))

--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@ -397,6 +397,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'timestamp': 1324343742,
                'upload_date': '20111220',
                'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+                'view_count': int,
            },
        },
        {
@ -693,12 +694,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'timestamp', default=None)

        try:
-            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
+            # When userInteractionCount does not exist views is 0
+            view_count = int_or_none(
+                self._search_regex(
+                    r'"interactionType":"http:\/\/schema\.org\/WatchAction","userInteractionCount":(.+?)}',
+                    webpage, 'view count', default=0
+                )
+            )
            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
        except RegexNotFoundError:
            # This info is only available in vimeo.com/{id} urls
-            view_count = None
            like_count = None
            comment_count = None

--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@ -282,7 +282,13 @@ class VKIE(VKBaseIE):
            # The video is not available in your region.
            'url': 'https://vk.com/video-51812607_171445436',
            'only_matching': True,
-        }]
+        },
+        {
+            # Video %s is not available.
+            'url': 'https://vk.com/video-173478245_456239188',
+            'only_matching': True,
+        },
+    ]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@ -345,6 +351,9 @@ class VKIE(VKBaseIE):

            r'<!>The video .+? is not available in your region.':
            'Video %s is not available in your region.',
+
+            r'<!>The video .+? is unavailable':
+                'Video %s is not available.',
        }

        for error_re, error_msg in ERRORS.items():
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2019.06.08'
+__version__ = 'vc.2019.06.08'