From 26bab54ef5239dcfa3f2ff180e472f828225c08a Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Mon, 7 Mar 2016 01:14:18 +0100 Subject: [PATCH] Added ArkenaPlay extractor [Extraction for different formats needs to be implemented] --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/arkenaplay.py | 129 +++++++++++++++++++++++++++++ youtube_dl/extractor/lcp.py | 124 +++------------------------ 3 files changed, 143 insertions(+), 111 deletions(-) create mode 100644 youtube_dl/extractor/arkenaplay.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8bb2631a7..a15572d7e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,6 +37,7 @@ from .ard import ( ARDMediathekIE, SportschauIE, ) +from .arkenaplay import ArkenaPlayIE from .arte import ( ArteTvIE, ArteTVPlus7IE, diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py new file mode 100644 index 000000000..ea6faac31 --- /dev/null +++ b/youtube_dl/extractor/arkenaplay.py @@ -0,0 +1,129 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601 +) + + +class ArkenaPlayIE(InfoExtractor): + IE_NAME = 'ArkenaPlay' + _VALID_URL = r'(?Phttps?://(?:www\.)?play\..*\..*)/embed/.*(?P\d+)?/.*' + + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'info_dict': { + 'id': '327336', + 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'ext': 'mp4', + 'title': '327336', + 'upload_date': '20160225', + 'timestamp': 1456391602 + } + }, { + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'url': 'http://88e04ec095b07cd1aa3ea588be47e870.httpcache0.90034-httpcache0.dna.qbrick.com/90034-httpcache0/4bf759a1-00090034/bbb_sunflower_2160p_60fps_normal_720p.mp4', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'upload_date': '20150528', + 'timestamp': 1432816365 + } + }] + + def _real_extract(self, url): + display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') + webpage = self._download_webpage(url, display_id) + + media_url_regex = '"(?P(?P.*)/config/avp/.*/\?callbackMethod=\?)"' + media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') + hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') + if not hostname: + hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') + media_url = hostname + media_url + + # Extract the required info of the media files gathered in a dictionary + arkena_info = self._download_webpage(media_url, 'arkena_info_') + arkena_info_regex = r'\?\((?P.*)\);' + media_dict = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), + display_id) + + # All videos are part of a playlist, a single video is also put in a playlist + playlist_items = media_dict.get('Playlist', []) + if len(playlist_items) == 0: + return self.url_result(url, 'Generic') + elif len(playlist_items) == 1: + arkena_media_info = playlist_items[0] + return self.__extract_from_playlistentry(arkena_media_info) + else: + entries_info = [] + for arkena_playlist_item in playlist_items: + entries_info.append(self.__extract_from_playlistentry(arkena_playlist_item)) + return { + 'id': display_id, + 'entries': entries_info + } + + def __extract_from_playlistentry(self, arkena_playlistentry_info): + formats = self.__get_video_formats(arkena_playlistentry_info) + media_info = arkena_playlistentry_info.get('MediaInfo', {}) + thumbnails = self.__get_thumbnails(media_info) + title = media_info.get('Title') + description = media_info.get('Description') + timestamp = parse_iso8601(media_info.get('PublishDate')) + return { + 'id': arkena_playlistentry_info.get('EntryName'), + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': description, + 'timestamp': timestamp + } + + def __get_thumbnails(self, arkena_mediainfo): + thumbnails = [] + thumbnails_info = arkena_mediainfo.get('Poster') + if not thumbnails_info: + return None + for thumbnail in thumbnails_info: + thumbnail_url = thumbnail.get('Url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('Size')) + }) + return thumbnails + + def __get_video_formats(self, media_files_info): + formats = [] + media_files = media_files_info.get('MediaFiles') + if not media_files: + return None + + formats.extend(self.__get_mp4_video_formats(media_files)) + # TODO + self._sort_formats(formats) + return formats + + def __get_mp4_video_formats(self, media_files_json): + formats = [] + mp4_files_json = media_files_json.get('Mp4') + if not mp4_files_json: + return None + for video_info in mp4_files_json: + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s + video_url = video_info.get('Url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'tbr': bitrate + }) + return formats diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index e49744e91..22c9642a6 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -1,11 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601 -) - class LcpIE(InfoExtractor): IE_NAME = 'LCP' @@ -18,21 +13,20 @@ class LcpIE(InfoExtractor): 'id': 'd56d03e9', 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'title': 'd56d03e9', 'upload_date': '20160226', - 'description': 'Le président du groupe parlementaire radical, républicain, démocrate et progressiste (RRDP) y voit une bonne occasion pour le président de la République de se "relégitimer".', 'timestamp': 1456488895 } }, { - 'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin', - 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'url': 'http://www.lcp.fr/emissions/parlementair', + 'md5': '9b63769445cbe5f26952bef71f281e8c', 'info_dict': { - 'id': '327336', - 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'id': '327499', + 'url': 're:http://httpod.scdn.arkena.com/11970/327499_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Politique Matin - Politique matin', - 'upload_date': '20160225', - 'timestamp': 1456391602 + 'title': '327499', + 'upload_date': '20160304', + 'timestamp': 1457098658 } }, { 'url': 'http://www.lcp.fr/le-direct', @@ -47,102 +41,10 @@ class LcpIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # Extract the required info of the media files gathered in a dictionary - media_dict = self.__extract_from_webpage(display_id, webpage) - # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs - if not media_dict: + embed_url_regex = r'"(?P(?:https?://(?:www\.)?)?play\.lcp\.fr/embed/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+)"' + embed_url = self._html_search_regex(embed_url_regex, webpage, 'player_url', default=None, fatal=False) + if not embed_url: return self.url_result(url, 'Generic') - # All videos are part of a playlist, a single video is also put in a playlist - playlist_files_info = media_dict.get('Playlist') - if not playlist_files_info: - return self.url_result(url, 'Generic') - - media_files_info = playlist_files_info[0] - video_formats = self.__get_video_formats(media_files_info) - video_thumbnails = self.__get_thumbnails(media_files_info) - video_timestamp = parse_iso8601(media_files_info.get('MediaInfo', {}).get('PublishDate')) - - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage, default=None) - - return { - 'id': media_files_info.get('EntryName'), - 'title': title, - 'formats': video_formats, - 'thumbnails': video_thumbnails, - 'description': description, - 'timestamp': video_timestamp - } - - def __extract_from_webpage(self, display_id, webpage): - """Extracts the media info JSON object for the video for the provided web page.""" - embed_url = self.__extract_embed_url(webpage) - embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/(?P[A-za-z0-9]+)/(?P[A-za-z0-9]+)/(?P[^\/]+)' - - clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', default=None) - player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', default=None) - skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', default=None) - - # Check whether the matches failed, which might be when dealing with other players (e.g., dailymotion stream) - if not clip_id or not player_id or not skin_name: - return None - - return self.__extract_from_player(display_id, clip_id, player_id, skin_name) - - def __extract_embed_url(self, webpage): - return self._search_regex( - r']+src=(["\'])(?P.+?)\1', - webpage, 'embed url', group='url') - - def __extract_from_player(self, display_id, clip_id, player_id, skin_name): - """Extracts the JSON object containing the required media info from the embedded arkena player""" - arkena_url = 'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'.format(clip_id, - skin_name, - player_id) - arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id) - arkena_info_regex = r'\?\((?P.*)\);' - return self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), - display_id) - - def __get_thumbnails(self, media_files_info): - thumbnails = [] - media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster') - if not media_thumbnail_info: - return None - for thumbnail in media_thumbnail_info: - thumbnail_url = thumbnail.get('Url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('Size')) - }) - return thumbnails - - def __get_video_formats(self, media_files_info): - formats = [] - media_files = media_files_info.get('MediaFiles') - if not media_files: - return None - - formats.extend(self.__get_mp4_video_formats(media_files)) - self._sort_formats(formats) - return formats - - def __get_mp4_video_formats(self, media_files_json): - formats = [] - mp4_files_json = media_files_json.get('Mp4') - if not mp4_files_json: - return None - for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s - video_url = video_info.get('Url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'tbr': bitrate - }) - return formats + title = self._og_search_title(webpage, default=None) + return self.url_result(embed_url, 'ArkenaPlay', video_id=display_id, video_title=title)