diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 1ae606f1e..8bb2631a7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -349,6 +349,7 @@ from .kuwo import (
 )
 from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
+from .lcp import LcpIE
 from .lecture2go import Lecture2GoIE
 from .lemonde import LemondeIE
 from .letv import (
diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py
new file mode 100644
index 000000000..be586e1f1
--- /dev/null
+++ b/youtube_dl/extractor/lcp.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none
+)
+
+
+class LcpIE(InfoExtractor):
+    """Extractor for videos published on lcp.fr.
+
+    The video page embeds a player in an iframe.  For lcp.fr embeds the media
+    description is fetched from the Arkena player configuration service;
+    any other iframe URL is returned through url_result.
+    """
+
+    IE_NAME = 'LCP'
+    _VALID_URL = r'https?:\/\/(?:www\.)?lcp\.fr\/(?:[^\/]+/)*(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
+        'md5': 'aecf5a330cfc1061445a9af5b2df392d',
+        'info_dict': {
+            'id': 'd56d03e9',
+            'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4',
+            'ext': 'mp4',
+            'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche'
+        }
+    }, {
+        'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin',
+        'md5': '6cea4f7d13810464ef8485a924fc3333',
+        'info_dict': {
+            'id': '327336',
+            'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4',
+            'ext': 'mp4',
+            'title': 'Politique Matin - Politique matin'
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Extract the information for a given URL and return it in a dictionary."""
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        media_files_info = self.__extract_from_webpage(display_id, webpage)
+        if media_files_info is None:
+            # Some web pages embed videos from other platforms like
+            # dailymotion, therefore we pass on these URLs.
+            return self.url_result(self.__extract_embed_url(webpage))
+
+        return {
+            'id': media_files_info['EntryName'],
+            'title': self._og_search_title(webpage),
+            'formats': self.__get_video_formats(media_files_info),
+            'thumbnails': self.__get_thumbnails(media_files_info),
+        }
+
+    def __extract_from_webpage(self, display_id, webpage):
+        """Return the media info JSON object for the video of the web page.
+
+        Returns None when the iframe of the page is not an lcp.fr embed or
+        when the player configuration contains no playlist entry.
+        """
+        embed_url = self.__extract_embed_url(webpage)
+        # Embed URLs look like lcp.fr/embed/<clip id>/<player id>/<skin name>
+        mobj = re.search(
+            r'(?:[a-zA-Z0-9]+\.)?lcp\.fr\/embed\/(?P<clip_id>[A-Za-z0-9]+)\/(?P<player_id>[A-Za-z0-9]+)\/(?P<skin_name>[^\/]+)',
+            embed_url)
+        if mobj is None:
+            return None
+
+        return self.__extract_from_player(
+            display_id, mobj.group('clip_id'), mobj.group('player_id'),
+            mobj.group('skin_name'))
+
+    def __extract_embed_url(self, webpage):
+        """Return the src URL of the first iframe in the web page."""
+        return self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
+            webpage, 'embed url', group='url')
+
+    def __extract_from_player(self, display_id, clip_id, player_id, skin_name):
+        """Return the media info of the clip from the embedded Arkena player.
+
+        Returns None when the player configuration has no playlist entry.
+        """
+        arkena_url = 'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'.format(
+            clip_id, skin_name, player_id)
+        arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id)
+
+        # The response wraps the JSON in a callback call: ?({...});
+        info_json = self._parse_json(
+            self._search_regex(
+                r'\?\((?P<json>.*)\);', arkena_info, 'json', group='json'),
+            display_id)
+
+        # All videos are part of a playlist, a single video is in a playlist
+        # of size 1.  An absent or empty playlist yields None.
+        playlist = info_json.get('Playlist')
+        if not playlist:
+            return None
+        return playlist[0]
+
+    def __get_thumbnails(self, media_files_info):
+        """Return the thumbnails contained in the media info."""
+        posters = (media_files_info.get('MediaInfo') or {}).get('Poster') or []
+        return [{
+            'url': poster['Url'],
+            'width': int_or_none(poster.get('Size')),
+        } for poster in posters if poster.get('Url')]
+
+    def __get_video_formats(self, media_files_info):
+        """Return the sorted video formats contained in the media info."""
+        formats = self.__get_mp4_video_formats(
+            media_files_info.get('MediaFiles') or {})
+        self._sort_formats(formats)
+        return formats
+
+    def __get_mp4_video_formats(self, media_files_json):
+        """Return all mp4 video formats contained in the media files info."""
+        formats = []
+        for video_info in media_files_json.get('Mp4') or []:
+            video_url = video_info.get('Url')
+            if not video_url:
+                # An entry without a URL cannot be downloaded
+                continue
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                # Bitrate is scaled down by 1000 to get KBit/s
+                'tbr': int_or_none(video_info.get('Bitrate'), scale=1000),
+            })
+        return formats