Removed redundant comments and needless escaping of slashes in regular expressions, used the scale argument of int_or_none, and passed already supported embeds directly to the generic extractor

2016-02-27 19:54:38 +01:00 · 2016-02-27 19:54:38 +01:00 · ccdbe1e8fa
commit ccdbe1e8fa
parent fa67589134
1 changed file with 8 additions and 23 deletions
--- a/youtube_dl/extractor/lcp.py
+++ b/youtube_dl/extractor/lcp.py
@@ -8,7 +8,7 @@ from ..utils import (

 class LcpIE(InfoExtractor):
    IE_NAME = 'LCP'
-    _VALID_URL = r'https?:\/\/(?:www\.)?lcp\.fr\/(?:[^\/]+/)*(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
@@ -31,24 +31,18 @@ class LcpIE(InfoExtractor):
    }]

    def _real_extract(self, url):
-        """Extracts the information for a given url and returns it in a dictionary"""
        display_id = self._match_id(url)
-
-        # Extract the web page
        webpage = self._download_webpage(url, display_id)

-        # Extract the required info of the media files
+        # Extract the required info of the media files gathered in a dictionary
        media_files_info = self.__extract_from_webpage(display_id, webpage)
-        # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URL
+        # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs
        if media_files_info is None:
-            return self.url_result(self.__extract_embed_url(webpage))
+            return self.url_result(url, 'Generic')

-        # Extract the video formats from the media info
        video_formats = self.__get_video_formats(media_files_info)
-        # Extract the thumbnails from the media info
        video_thumbnails = self.__get_thumbnails(media_files_info)

-        # Return the dictionary with the information about the video to download
        return {
            'id': media_files_info['EntryName'],
            'title': self._og_search_title(webpage),
@@ -59,22 +53,19 @@ class LcpIE(InfoExtractor):
    def __extract_from_webpage(self, display_id, webpage):
        """Extracts the media info JSON object for the video for the provided web page."""
        embed_url = self.__extract_embed_url(webpage)
-        embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr\/embed\/(?P<clip_id>[A-za-z0-9]+)\/(?P<player_id>[A-za-z0-9]+)\/(?P<skin_name>[^\/]+)'
+        embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/(?P<clip_id>[A-za-z0-9]+)/(?P<player_id>[A-za-z0-9]+)/(?P<skin_name>[^\/]+)'

-        # Extract the identifying attributes from the embed url of the web page
        clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', fatal=False)
        player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', fatal=False)
        skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', fatal=False)

-        # Check whether the extraction of the clip id, player id or skin name
+        # Check whether the matches failed, which might be when dealing with other players (e.g., dailymotion stream)
        if (clip_id is None) or (player_id is None) or (skin_name is None):
            return None

-        # Extract the video url from the embedded player
        return self.__extract_from_player(display_id, clip_id, player_id, skin_name)

    def __extract_embed_url(self, webpage):
-        """Extracts the embedded player url for the video."""
        return self._search_regex(
            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
            webpage, 'embed url', group='url')
@@ -86,19 +77,17 @@ class LcpIE(InfoExtractor):
                                                                                                              player_id)
        arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id)

-        # Extract the json containing information about the video files
        arkena_info_regex = r'\?\((?P<json>.*)\);'
        info_json = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'),
                                     display_id)

-        # All videos are part of a playlist, a single video is in a playlist of size 1
+        # All videos are part of a playlist, a single video is also put in a playlist
        media_files_info = info_json.get('Playlist')
        if media_files_info is not None:
            media_files_info = media_files_info[0]
        return media_files_info

    def __get_thumbnails(self, media_files_info):
-        """Retrieves the thumbnails contained in the media info"""
        thumbnails = []
        media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster')
        if media_thumbnail_info is not None:
@@ -110,7 +99,6 @@ class LcpIE(InfoExtractor):
        return thumbnails

    def __get_video_formats(self, media_files_info):
-        """Retrieves the video formats contained in the media file info"""
        formats = []
        media_files = media_files_info.get('MediaFiles')

@@ -121,14 +109,11 @@ class LcpIE(InfoExtractor):
        return formats

    def __get_mp4_video_formats(self, media_files_json):
-        """Retrieves all mp4 video formats contained in the media file info"""
        formats = []
        mp4_files_json = media_files_json.get('Mp4')
        if mp4_files_json is not None:
            for video_info in mp4_files_json:
-                bitrate = int_or_none(video_info.get('Bitrate'))
-                if bitrate is not None:
-                    bitrate /= 1000  # Set bitrate to KBit/s
+                bitrate = int_or_none(video_info.get('Bitrate'), scale=0.001) # Scale bitrate to KBit/s
                formats.append({
                    'url': video_info.get('Url'),
                    'ext': 'mp4',