Fixed Dailymotion view_count extraction and added support for playerV5 embed pages

2015-07-17 01:00:58 +05:00 · 2015-07-17 01:00:58 +05:00 · 26b26d207d
commit 26b26d207d
parent e901e6fa81
2 changed files with 41 additions and 29 deletions
--- a/BIN
+++ b/BIN
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@ -19,6 +19,7 @@ from ..utils import (
    unescapeHTML,
 )
 class DailymotionBaseInfoExtractor(InfoExtractor):
    @staticmethod
    def _build_request(url):
@ -33,6 +34,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
@ -121,10 +123,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
        embed_request = self._build_request(embed_url)
        embed_page = self._download_webpage(
            embed_request, video_id, 'Downloading embed page')
-        info = self._search_regex(r'var info = ({.*?}),$', embed_page,
+        checkv5 = self._search_regex(r'playerV5(.)', embed_page,
-                                  'video info', flags=re.MULTILINE, fatal=False)
+                                'checkv5', default=None, fatal=False)
-        """For normal embed pages with info JSON"""
+        """For normal embed pages with info variable"""
-        if info is not None: 
+        if checkv5 is None:
            info = self._search_regex(r'var info = ({.*?}),$', embed_page,
                                  'video info', flags=re.MULTILINE)
            info = json.loads(info)
            if info.get('error') is not None:
                msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
@ -148,9 +152,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
            if not formats:
                raise ExtractorError('Unable to extract video URL')
            video_subtitles = self.extract_subtitles(video_id, webpage)
-            view_count = str_to_int(self._search_regex(
+            view_count = self._search_regex(
-                r'video_views_count[^>]+>\s+([\d\.,]+)',
+                r'video_views_count[^>]+>\s+([\d\. ]+)\s+views',
-                webpage, 'view count', fatal=False))
+                webpage, 'view count', fatal=False)
            view_count = view_count.replace(" ", "")
            view_count = str_to_int(view_count)
            title = self._og_search_title(webpage, default=None)
            if title is None:
                title = self._html_search_regex(
@ -171,38 +177,44 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
        else:
            formats = []
            for (key, format_id) in self._FORMATSv5:
-                video_url = self._search_regex(r'%s+".{30}(.*?)"' % key, embed_page,
+                """Verify format is available"""
-                                               'video info', flags=re.MULTILINE, fatal=False)
+                checkformat = self._search_regex(r'%s+":(.)' % key, embed_page,
-                if video_url:
+                                            'checkformat', default=None)
                if checkformat is not None:
                    video_url = self._search_regex(r'%s+".{30}(.*?)"' % key, embed_page,
                                                   'video info', flags=re.MULTILINE)
                    video_url = video_url.replace("\\", "")
-                if video_url is not None:
+                    if video_url is not None:
-                    m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
+                        m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
-                    if m_size is not None:
+                        if m_size is not None:
-                        width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
+                            width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
-                    else:
+                        else:
-                        width, height = None, None
+                            width, height = None, None
-                    formats.append({
+                        formats.append({
-                                    'url': video_url,
+                                        'url': video_url,
-                                    'ext': 'mp4',
+                                        'ext': 'mp4',
-                                    'format_id': format_id,
+                                        'format_id': format_id,
-                                    'width': width,
+                                        'width': width,
-                                    'height': height,
+                                        'height': height,
-                                    })
+                                        })
            if not formats:
                raise ExtractorError('Unable to extract video URL from playerv5 page')
            v5screenname = self._search_regex(r'screenname":"(.*?)"', embed_page,
-            'video info', flags=re.MULTILINE)
+            'video info-v5screenname', flags=re.MULTILINE, fatal=False)
            v5thumbnailurl = self._search_regex(r'poster_url":"(.*?)"', embed_page,
-            'video info', flags=re.MULTILINE) 
+            'video info-v5thumbnailurl', flags=re.MULTILINE, fatal=False) 
            if v5thumbnailurl is not None:
                v5thumbnailurl = v5thumbnailurl.replace("\\", "")
            video_subtitles = self.extract_subtitles(video_id, webpage)
-            view_count = str_to_int(self._search_regex(
+            view_count = self._search_regex(r'video_views_count[^>]+>\s+([\d\. ]+)\s+views',
-                r'video_views_count[^>]+>\s+([\d\.,]+)',
+                webpage, 'view count', fatal=False)
-                webpage, 'view count', fatal=False))
+            view_count = view_count.replace(" ", "")
            view_count = str_to_int(view_count)
            title = self._og_search_title(webpage, default=None)
            if title is None:
                title = self._html_search_regex(
                    r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
-                    'title')       
+                    'title')      
            return  {
                'id':       video_id,
                'formats': formats,