2016-02-27 18:21:42 +01:00
# coding: utf-8
from __future__ import unicode_literals
from . common import InfoExtractor
from . . utils import (
2016-02-27 23:20:45 +01:00
int_or_none ,
parse_iso8601
2016-02-27 18:21:42 +01:00
)
class LcpIE(InfoExtractor):
    """Information extractor for lcp.fr pages.

    The page is searched for an iframe pointing at an lcp.fr embed URL; the
    identifiers found there are used to query the Arkena player configuration,
    which lists the media files. Pages without such an embed (or whose player
    configuration holds no playlist) are handed to the generic extractor.
    """

    IE_NAME = 'LCP'
    _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P<id>[^/]+)'

    _TESTS = [{
        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
        'md5': 'aecf5a330cfc1061445a9af5b2df392d',
        'info_dict': {
            'id': 'd56d03e9',
            'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4',
            'ext': 'mp4',
            'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche',
            'upload_date': '20160226',
            'description': 'Le président du groupe parlementaire radical, républicain, démocrate et progressiste (RRDP) y voit une bonne occasion pour le président de la République de se "relégitimer".',
            'timestamp': 1456488895
        }
    }, {
        'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin',
        'md5': '6cea4f7d13810464ef8485a924fc3333',
        'info_dict': {
            'id': '327336',
            'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4',
            'ext': 'mp4',
            'title': 'Politique Matin - Politique matin',
            'upload_date': '20160225',
            'timestamp': 1456391602
        }
    }, {
        'url': 'http://www.lcp.fr/le-direct',
        'info_dict': {
            'title': 'Le direct | LCP Assembl\xe9e nationale',
            'id': 'le-direct',
        },
        'playlist_mincount': 1
    }]

    def _real_extract(self, url):
        """Build the info dict for ``url`` or delegate to the generic extractor.

        Returns either a dict with ``id``, ``title``, ``formats``,
        ``thumbnails``, ``description`` and ``timestamp``, or the result of
        ``url_result(url, 'Generic')`` when no usable player data is found.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Extract the required info of the media files gathered in a dictionary
        media_dict = self.__extract_from_webpage(display_id, webpage)

        # Some web pages embed videos from other platforms like dailymotion,
        # therefore we pass on these URLs
        if not media_dict:
            return self.url_result(url, 'Generic')

        # All videos are part of a playlist, a single video is also put in a
        # playlist; only the first playlist entry is used.
        playlist_files_info = media_dict.get('Playlist')
        if not playlist_files_info:
            return self.url_result(url, 'Generic')
        media_files_info = playlist_files_info[0]

        # ``or {}`` also covers a key that is present but null in the JSON.
        media_info = media_files_info.get('MediaInfo') or {}

        return {
            # Fall back to the URL slug so that the id is never None.
            'id': media_files_info.get('EntryName') or display_id,
            'title': self._og_search_title(webpage),
            'formats': self.__get_video_formats(media_files_info),
            'thumbnails': self.__get_thumbnails(media_files_info),
            'description': self._html_search_meta(
                'description', webpage, default=None),
            'timestamp': parse_iso8601(media_info.get('PublishDate')),
        }

    def __extract_from_webpage(self, display_id, webpage):
        """Extracts the media info JSON object for the video for the provided web page.

        Returns None when the page has no iframe or when the iframe URL is
        not an lcp.fr embed URL (e.g. a dailymotion stream).
        """
        embed_url = self.__extract_embed_url(webpage)
        if not embed_url:
            return None

        embed_regex = (
            r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/'
            r'(?P<clip_id>[A-Za-z0-9]+)/(?P<player_id>[A-Za-z0-9]+)/(?P<skin_name>[^\/]+)')

        clip_id = self._search_regex(
            embed_regex, embed_url, 'clip id', group='clip_id', default=None)
        player_id = self._search_regex(
            embed_regex, embed_url, 'player id', group='player_id', default=None)
        skin_name = self._search_regex(
            embed_regex, embed_url, 'skin name', group='skin_name', default=None)

        # Check whether the matches failed, which might be when dealing with
        # other players (e.g., dailymotion stream)
        if not (clip_id and player_id and skin_name):
            return None

        return self.__extract_from_player(display_id, clip_id, player_id, skin_name)

    def __extract_embed_url(self, webpage):
        """Return the ``src`` of the first iframe in ``webpage``, or None."""
        return self._search_regex(
            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
            webpage, 'embed url', group='url', default=None)

    def __extract_from_player(self, display_id, clip_id, player_id, skin_name):
        """Extracts the JSON object containing the required media info from the embedded arkena player"""
        arkena_url = (
            'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'
        ).format(clip_id, skin_name, player_id)
        arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id)

        # The response wraps the JSON as ``?(...);``; keep only the payload.
        arkena_info_regex = r'\?\((?P<json>.*)\);'
        return self._parse_json(
            self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'),
            display_id)

    def __get_thumbnails(self, media_files_info):
        """Return a list of ``{'url', 'width'}`` dicts built from ``MediaInfo.Poster``.

        Entries without a URL are skipped; the list is empty when no poster
        information is present.
        """
        media_info = media_files_info.get('MediaInfo') or {}
        return [
            {
                'url': thumbnail.get('Url'),
                'width': int_or_none(thumbnail.get('Size')),
            }
            for thumbnail in media_info.get('Poster') or []
            if thumbnail.get('Url')
        ]

    def __get_video_formats(self, media_files_info):
        """Return the sorted list of formats found under ``MediaFiles``.

        An empty list (rather than None) is returned when the entry has no
        media files, so the caller always receives a list.
        """
        media_files = media_files_info.get('MediaFiles')
        if not media_files:
            return []

        formats = list(self.__get_mp4_video_formats(media_files))
        self._sort_formats(formats)
        return formats

    def __get_mp4_video_formats(self, media_files_json):
        """Return one format dict per ``Mp4`` entry that carries a URL.

        Always returns a list; it is empty when there is no ``Mp4`` key so
        that callers can safely extend another list with the result.
        """
        return [
            {
                'url': video_info.get('Url'),
                'ext': 'mp4',
                # Scale bitrate to KBit/s
                'tbr': int_or_none(video_info.get('Bitrate'), scale=1000),
            }
            for video_info in media_files_json.get('Mp4') or []
            if video_info.get('Url')
        ]