[LCP] Add new extractor

This commit is contained in:
Rob van Bekkum 2016-02-27 18:21:42 +01:00
parent bf4b3b6bd9
commit fa67589134
2 changed files with 138 additions and 0 deletions

View File

@ -349,6 +349,7 @@ from .kuwo import (
) )
from .la7 import LA7IE from .la7 import LA7IE
from .laola1tv import Laola1TvIE from .laola1tv import Laola1TvIE
from .lcp import LcpIE
from .lecture2go import Lecture2GoIE from .lecture2go import Lecture2GoIE
from .lemonde import LemondeIE from .lemonde import LemondeIE
from .letv import ( from .letv import (

137
youtube_dl/extractor/lcp.py Normal file
View File

@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none
)
class LcpIE(InfoExtractor):
IE_NAME = 'LCP'
_VALID_URL = r'https?:\/\/(?:www\.)?lcp\.fr\/(?:[^\/]+/)*(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
'md5': 'aecf5a330cfc1061445a9af5b2df392d',
'info_dict': {
'id': 'd56d03e9',
'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4',
'ext': 'mp4',
'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche'
}
}, {
'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin',
'md5': '6cea4f7d13810464ef8485a924fc3333',
'info_dict': {
'id': '327336',
'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4',
'ext': 'mp4',
'title': 'Politique Matin - Politique matin'
}
}]
def _real_extract(self, url):
"""Extracts the information for a given url and returns it in a dictionary"""
display_id = self._match_id(url)
# Extract the web page
webpage = self._download_webpage(url, display_id)
# Extract the required info of the media files
media_files_info = self.__extract_from_webpage(display_id, webpage)
# Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URL
if media_files_info is None:
return self.url_result(self.__extract_embed_url(webpage))
# Extract the video formats from the media info
video_formats = self.__get_video_formats(media_files_info)
# Extract the thumbnails from the media info
video_thumbnails = self.__get_thumbnails(media_files_info)
# Return the dictionary with the information about the video to download
return {
'id': media_files_info['EntryName'],
'title': self._og_search_title(webpage),
'formats': video_formats,
'thumbnails': video_thumbnails
}
def __extract_from_webpage(self, display_id, webpage):
"""Extracts the media info JSON object for the video for the provided web page."""
embed_url = self.__extract_embed_url(webpage)
embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr\/embed\/(?P<clip_id>[A-za-z0-9]+)\/(?P<player_id>[A-za-z0-9]+)\/(?P<skin_name>[^\/]+)'
# Extract the identifying attributes from the embed url of the web page
clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', fatal=False)
player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', fatal=False)
skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', fatal=False)
# Check whether the extraction of the clip id, player id or skin name
if (clip_id is None) or (player_id is None) or (skin_name is None):
return None
# Extract the video url from the embedded player
return self.__extract_from_player(display_id, clip_id, player_id, skin_name)
def __extract_embed_url(self, webpage):
"""Extracts the embedded player url for the video."""
return self._search_regex(
r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
webpage, 'embed url', group='url')
def __extract_from_player(self, display_id, clip_id, player_id, skin_name):
"""Extracts the JSON object containing the required media info from the embedded arkena player"""
arkena_url = 'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'.format(clip_id,
skin_name,
player_id)
arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id)
# Extract the json containing information about the video files
arkena_info_regex = r'\?\((?P<json>.*)\);'
info_json = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'),
display_id)
# All videos are part of a playlist, a single video is in a playlist of size 1
media_files_info = info_json.get('Playlist')
if media_files_info is not None:
media_files_info = media_files_info[0]
return media_files_info
def __get_thumbnails(self, media_files_info):
"""Retrieves the thumbnails contained in the media info"""
thumbnails = []
media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster')
if media_thumbnail_info is not None:
for thumbnail in media_thumbnail_info:
thumbnails.append({
'url': thumbnail.get('Url'),
'width': int_or_none(thumbnail.get('Size'))
})
return thumbnails
def __get_video_formats(self, media_files_info):
"""Retrieves the video formats contained in the media file info"""
formats = []
media_files = media_files_info.get('MediaFiles')
if media_files is not None:
formats.extend(self.__get_mp4_video_formats(media_files))
self._sort_formats(formats)
return formats
def __get_mp4_video_formats(self, media_files_json):
"""Retrieves all mp4 video formats contained in the media file info"""
formats = []
mp4_files_json = media_files_json.get('Mp4')
if mp4_files_json is not None:
for video_info in mp4_files_json:
bitrate = int_or_none(video_info.get('Bitrate'))
if bitrate is not None:
bitrate /= 1000 # Set bitrate to KBit/s
formats.append({
'url': video_info.get('Url'),
'ext': 'mp4',
'tbr': bitrate
})
return formats