From e2e08a82d0653f3763a66bc56a0869eb0ba650fa Mon Sep 17 00:00:00 2001 From: flatgreen Date: Sun, 21 Aug 2016 23:00:52 +0200 Subject: [PATCH 1/6] [universcience] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/universcience.py | 97 +++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/universcience.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8e405ad72..9fb7ff044 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -938,6 +938,7 @@ from .udemy import ( from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE +from .universcience import UniverscienceIE from .uol import UOLIE from .uplynk import ( UplynkIE, diff --git a/youtube_dl/extractor/universcience.py b/youtube_dl/extractor/universcience.py new file mode 100644 index 000000000..70f97220e --- /dev/null +++ b/youtube_dl/extractor/universcience.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + xpath_text, + xpath_element, + xpath_attr, + clean_html, +) + + +class UniverscienceIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-(.*)-(?P[0-9]+)\.html' + _TEST = { + 'url': 'http://www.universcience.tv/video-haro-sur-les-loups-o-5466.html', + 'info_dict': { + 'id': '5466', + 'duration': 1990, + 'ext': 'mp4', + 'title': 'Haro sur les loups ?', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'startswith:Face au retour', + 'creator': 'Sylvie Allonneau', + 'subtitles': { + 'fr': [{ + 'url': 'http://universcience-webtv2-videos-pad.brainsonic.com/1/20121217-100607/attachedFiles/subtitles/2c4a2240149dbb984edc8afefea23a7c.srt', + }], + }, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + APIKey = 
self._html_search_regex(r'["\']APIKey["\'](.*)["\'](?P.*)["\']', webpage, 'APIKey', group='APIKey') + url_get_media = 'http://universcience-webtv2-services-pad.brainsonic.com/rest/getMedia?APIKey=' + APIKey + '&byMediaId=' + video_id + xml = self._download_xml(url_get_media, video_id) + path_media = xpath_element(xml, './medias/media', fatal=True) + + title = xpath_text(path_media, './title') + creator = xpath_text(path_media, './author') + duration = int_or_none(xpath_text(path_media, './length')) + description = clean_html(xpath_text(path_media, './description')) + thumbnail = xpath_text(path_media, './thumbnail_url') + + subtitles = {} + subtitle_urls = path_media.findall('./subtitles/subtitle') + for subtitle in subtitle_urls: + lang = subtitle.get('lang') + subtitles[lang] = [{ + 'url': subtitle.text, + }] + + formats = [] + path_media_source = './medias/media/media_sources/media_source' + for media_source in xml.findall(path_media_source): + format_url = xpath_text(media_source, 'source', fatal=True) + media_label = xpath_attr(media_source, './streaming_type', 'label') + media_width = self._search_regex( + r'.* (\d*) x \d*', media_label, 'width', default='None', fatal=False) + media_height = self._search_regex( + r'.* \d* x (\d*)', media_label, 'height', default='None', fatal=False) + media_label = self._search_regex( + r'(.*) (\d* x \d*)', media_label, 'media_label', default=media_label, fatal=False) + + if (media_label == 'HLS') or (media_label == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) + else: + format_info = { + 'width': int_or_none(media_width), + 'height': int_or_none(media_height), + 'tbr': int_or_none(xpath_attr(media_source, './streaming_type', 'bitrate')), + # 'vcodec': xpath_attr -> bug sur regexp? 
+ 'vcodec': media_source.find('streaming_type').get('html5_codec'), + 'url': format_url, + 'format_id': 'http-%s' % media_label, + } + formats.append(format_info) + + podcast_url = xpath_text(path_media, './podcast_url') + formats.append({'format_id': 'podcast', 'vcodec': 'none', 'url': podcast_url}) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'duration': duration, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + 'creator': creator, + 'subtitles': subtitles, + } From ae33167571b26c36fb96b4ae6eb7c087b4a51f28 Mon Sep 17 00:00:00 2001 From: flatgreen Date: Thu, 25 Aug 2016 16:56:36 +0200 Subject: [PATCH 2/6] universcience 2 --- youtube_dl/extractor/canalu.py | 38 +++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/universcience.py | 24 +++++++++-------- 3 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 youtube_dl/extractor/canalu.py diff --git a/youtube_dl/extractor/canalu.py b/youtube_dl/extractor/canalu.py new file mode 100644 index 000000000..632ee1aa5 --- /dev/null +++ b/youtube_dl/extractor/canalu.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CanalUIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': 're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example 
... + title = self._html_search_regex(r'

<h1>(.+?)</h1>

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } \ No newline at end of file diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9fb7ff044..052b386b5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -116,6 +116,7 @@ from .camdemy import ( from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .canalu import CanalUIE from .canvas import CanvasIE from .carambatv import ( CarambaTVIE, diff --git a/youtube_dl/extractor/universcience.py b/youtube_dl/extractor/universcience.py index 70f97220e..caadd172a 100644 --- a/youtube_dl/extractor/universcience.py +++ b/youtube_dl/extractor/universcience.py @@ -8,11 +8,12 @@ from ..utils import ( xpath_element, xpath_attr, clean_html, + update_url_query, ) class UniverscienceIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-(.*)-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-.*-(?P[0-9]+)\.html' _TEST = { 'url': 'http://www.universcience.tv/video-haro-sur-les-loups-o-5466.html', 'info_dict': { @@ -34,8 +35,10 @@ class UniverscienceIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - APIKey = self._html_search_regex(r'["\']APIKey["\'](.*)["\'](?P.*)["\']', webpage, 'APIKey', group='APIKey') - url_get_media = 'http://universcience-webtv2-services-pad.brainsonic.com/rest/getMedia?APIKey=' + APIKey + '&byMediaId=' + video_id + APIKey = self._html_search_regex(r'["\']APIKey["\'].*["\'](?P.*)["\']', webpage, 'APIKey', group='APIKey') + url_get_media = update_url_query( + 
'http://universcience-webtv2-services-pad.brainsonic.com/rest/getMedia', + {'APIKey': APIKey, 'byMediaId': video_id}) xml = self._download_xml(url_get_media, video_id) path_media = xpath_element(xml, './medias/media', fatal=True) @@ -58,20 +61,18 @@ class UniverscienceIE(InfoExtractor): for media_source in xml.findall(path_media_source): format_url = xpath_text(media_source, 'source', fatal=True) media_label = xpath_attr(media_source, './streaming_type', 'label') - media_width = self._search_regex( - r'.* (\d*) x \d*', media_label, 'width', default='None', fatal=False) - media_height = self._search_regex( - r'.* \d* x (\d*)', media_label, 'height', default='None', fatal=False) + media_width = int_or_none(self._search_regex(r'(\d*) x \d*', media_label, 'width', default=None)) + media_height = int_or_none(self._search_regex(r'\d* x (\d*)', media_label, 'height', default=None)) media_label = self._search_regex( r'(.*) (\d* x \d*)', media_label, 'media_label', default=media_label, fatal=False) - if (media_label == 'HLS') or (media_label == 'm3u8'): + if media_label in ('HLS', 'm3u8'): formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hds', fatal=False)) else: format_info = { - 'width': int_or_none(media_width), - 'height': int_or_none(media_height), + 'width': media_width, + 'height': media_height, 'tbr': int_or_none(xpath_attr(media_source, './streaming_type', 'bitrate')), # 'vcodec': xpath_attr -> bug sur regexp? 
'vcodec': media_source.find('streaming_type').get('html5_codec'), @@ -81,7 +82,8 @@ class UniverscienceIE(InfoExtractor): formats.append(format_info) podcast_url = xpath_text(path_media, './podcast_url') - formats.append({'format_id': 'podcast', 'vcodec': 'none', 'url': podcast_url}) + if podcast_url is not None: + formats.append({'format_id': 'podcast', 'vcodec': 'none', 'url': podcast_url}) self._sort_formats(formats) From 85ab99d1f2957e4b587ae79eab5dd95077e23182 Mon Sep 17 00:00:00 2001 From: flatgreen Date: Thu, 25 Aug 2016 16:57:17 +0200 Subject: [PATCH 3/6] universcience 2 --- youtube_dl/extractor/canalu.py | 38 ---------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 youtube_dl/extractor/canalu.py diff --git a/youtube_dl/extractor/canalu.py b/youtube_dl/extractor/canalu.py deleted file mode 100644 index 632ee1aa5..000000000 --- a/youtube_dl/extractor/canalu.py +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class CanalUIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' - _TEST = { - 'url': 'http://yourextractor.com/watch/42', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', - 'info_dict': { - 'id': '42', - 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': 're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # TODO more code goes here, for example ... - title = self._html_search_regex(r'

<h1>(.+?)</h1>

', webpage, 'title') - - return { - 'id': video_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) - } \ No newline at end of file From 0a0c6d27ba5456b82e304befa8825b010e14fb5a Mon Sep 17 00:00:00 2001 From: flatgreen Date: Fri, 26 Aug 2016 08:31:27 +0200 Subject: [PATCH 4/6] universcience 3 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/universcience.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 052b386b5..9fb7ff044 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -116,7 +116,6 @@ from .camdemy import ( from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canalu import CanalUIE from .canvas import CanvasIE from .carambatv import ( CarambaTVIE, diff --git a/youtube_dl/extractor/universcience.py b/youtube_dl/extractor/universcience.py index caadd172a..a0a3f5406 100644 --- a/youtube_dl/extractor/universcience.py +++ b/youtube_dl/extractor/universcience.py @@ -11,6 +11,8 @@ from ..utils import ( update_url_query, ) +import re + class UniverscienceIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-.*-(?P[0-9]+)\.html' @@ -64,7 +66,7 @@ class UniverscienceIE(InfoExtractor): media_width = int_or_none(self._search_regex(r'(\d*) x \d*', media_label, 'width', default=None)) media_height = int_or_none(self._search_regex(r'\d* x (\d*)', media_label, 'height', default=None)) media_label = self._search_regex( - r'(.*) (\d* x \d*)', media_label, 'media_label', default=media_label, fatal=False) + r'(.*) \d* x \d*', media_label, 'media_label', default=media_label, fatal=False) if media_label in ('HLS', 'm3u8'): 
formats.extend(self._extract_m3u8_formats( From cc59aba2b380b07f6872283466d917cf776f4a1f Mon Sep 17 00:00:00 2001 From: flatgreen Date: Fri, 26 Aug 2016 08:32:24 +0200 Subject: [PATCH 5/6] universcience 4 --- youtube_dl/extractor/universcience.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/universcience.py b/youtube_dl/extractor/universcience.py index a0a3f5406..f21326598 100644 --- a/youtube_dl/extractor/universcience.py +++ b/youtube_dl/extractor/universcience.py @@ -11,8 +11,6 @@ from ..utils import ( update_url_query, ) -import re - class UniverscienceIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-.*-(?P[0-9]+)\.html' From 7b188bb67fbbe3f463518290d1dc565ed277bbb8 Mon Sep 17 00:00:00 2001 From: flatgreen Date: Mon, 29 Aug 2016 10:58:52 +0200 Subject: [PATCH 6/6] universcience 5 --- youtube_dl/extractor/universcience.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/universcience.py b/youtube_dl/extractor/universcience.py index f21326598..beb258d79 100644 --- a/youtube_dl/extractor/universcience.py +++ b/youtube_dl/extractor/universcience.py @@ -11,6 +11,8 @@ from ..utils import ( update_url_query, ) +import re + class UniverscienceIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?universcience\.tv/video-.*-(?P[0-9]+)\.html' @@ -63,8 +65,7 @@ class UniverscienceIE(InfoExtractor): media_label = xpath_attr(media_source, './streaming_type', 'label') media_width = int_or_none(self._search_regex(r'(\d*) x \d*', media_label, 'width', default=None)) media_height = int_or_none(self._search_regex(r'\d* x (\d*)', media_label, 'height', default=None)) - media_label = self._search_regex( - r'(.*) \d* x \d*', media_label, 'media_label', default=media_label, fatal=False) + media_label = re.sub(' \d* x \d*', '', media_label) if media_label in ('HLS', 'm3u8'): formats.extend(self._extract_m3u8_formats(