From a1eb5017b09490a6541916f4231588aecac258d5 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 8 Mar 2019 07:37:22 +0200 Subject: [PATCH 1/3] [YleAreena] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/yleareena.py | 85 ++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 youtube_dl/extractor/yleareena.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 923dfe7f4..2ffcf34eb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1458,6 +1458,7 @@ from .yandexdisk import YandexDiskIE from .yapfiles import YapFilesIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE +from .yleareena import YleAreenaIE from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import ( diff --git a/youtube_dl/extractor/yleareena.py b/youtube_dl/extractor/yleareena.py new file mode 100644 index 000000000..fd4b003ff --- /dev/null +++ b/youtube_dl/extractor/yleareena.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + RegexNotFoundError, + url_or_none +) + + +class YleAreenaIE(InfoExtractor): + _VALID_URL = r'https?://(?:areena|arenan).yle.fi/(?P<id>[0-9]-[0-9]+)' + _GEO_COUNTRIES = ['FI'] + + _TEST = { + 'url': 'https://areena.yle.fi/1-4256816', + 'md5': 'b9658c5960a8c2ca4ba8f1b0ff079df2', + 'info_dict': { + 'id': '1_iq074q8b', + 'ext': 'mxf', + 'title': 'Luottomies | Luottomies jouluspeciaali', + 'description': u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? 
Joulun erikoisjakson on ohjannut Jalmari Helander.', + 'upload_date': '20171207', + 'height': 1080, + 'width': 1920, + 'fps': 25, + 'duration': 1302, + 'timestamp': 1512633989, + 'extractor': 'Kaltura', + 'uploader_id': 'ovp@yle.fi', + 'webpage_url_basename': '1-4256816', + 'webpage_url': 'https://areena.yle.fi/1-4256816' + } + } + + def _real_extract(self, url): + # Get essential data + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Extract essential metadata from Areena webpage + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + # player_url is not used for the actual extraction, + # just for getting partner_id and entry_id for Kaltura extractor + # (though it is still required or else the extraction will fail) + try: + player_url = url_or_none( + self._og_search_property('video:secure_url', webpage) + ) + except RegexNotFoundError: + player_url = None + + # If this backup fails extractor will error out + player_url = url_or_none( + self._og_search_property('video:url', webpage) + ) + + if player_url is None: + raise RegexNotFoundError('Cannot find player url') + + # Get Kaltura identifiers from player_url + partner_id = self._search_regex( + r'/p/([0-9]+)', + player_url, + 'Kaltura partner id' + ) + + entry_id = self._search_regex( + r'/entry_id/([0-9]_[0-9a-z]+)', + player_url, + 'Kaltura entry id' + ) + + kaltura_url = 'kaltura:%s:%s' % (partner_id, entry_id) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': kaltura_url, + 'ie_key': 'Kaltura', + 'title': title, + 'description': description + } From 49ca4a787555567716c775e06bf2d5d7480cc025 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 8 Mar 2019 17:12:11 +0200 Subject: [PATCH 2/3] Style changes and more fallbacks Following the style guide a bit better --- youtube_dl/extractor/yleareena.py | 59 +++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git 
a/youtube_dl/extractor/yleareena.py b/youtube_dl/extractor/yleareena.py index fd4b003ff..8685dd8da 100644 --- a/youtube_dl/extractor/yleareena.py +++ b/youtube_dl/extractor/yleareena.py @@ -19,7 +19,10 @@ class YleAreenaIE(InfoExtractor): 'id': '1_iq074q8b', 'ext': 'mxf', 'title': 'Luottomies | Luottomies jouluspeciaali', - 'description': u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? Joulun erikoisjakson on ohjannut Jalmari Helander.', + 'description': + u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan ' + 'mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? ' + 'Joulun erikoisjakson on ohjannut Jalmari Helander.', 'upload_date': '20171207', 'height': 1080, 'width': 1920, @@ -34,17 +37,44 @@ class YleAreenaIE(InfoExtractor): } def _real_extract(self, url): + # This extractor will fetch some basic info and then lead to Kaltura + # extractor. + props = { + '_type': 'url_transparent', + 'ie_key': 'Kaltura' + } + # Get essential data - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + props['id'] = self._match_id(url) + webpage = self._download_webpage(url, props['id']) - # Extract essential metadata from Areena webpage - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) + # Try to extract title from OpenGraph metadata + _title = self._og_search_title(webpage, fatal=False) - # player_url is not used for the actual extraction, - # just for getting partner_id and entry_id for Kaltura extractor - # (though it is still required or else the extraction will fail) + # Fallback #1: try to extract title from page body + if _title is None: + _title = self._html_search_regex( + r'

([^<]+)', + webpage, + 'title', + fatal=False + ) + + # Fallback #2: let Kaltura extractor give the title (it should have it) + # If title is found from Areena page, use it + if _title is not None: + props['title'] = _title + + # Same thing for description + _description = self._og_search_description(webpage) + + # No Areena fallback here, the page layout is so ambiguous we cannot + # guarantee that the right description would match in series pages + if _description is not None: + props['description'] = _description + + # player_url is used for getting partner_id and entry_id for Kaltura + # extractor try: player_url = url_or_none( self._og_search_property('video:secure_url', webpage) @@ -73,13 +103,6 @@ class YleAreenaIE(InfoExtractor): 'Kaltura entry id' ) - kaltura_url = 'kaltura:%s:%s' % (partner_id, entry_id) + props['url'] = 'kaltura:%s:%s' % (partner_id, entry_id) - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': kaltura_url, - 'ie_key': 'Kaltura', - 'title': title, - 'description': description - } + return props From 3dd4cb7425b4bcac0b71355fc7000663157b2b51 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 8 Mar 2019 19:44:34 +0200 Subject: [PATCH 3/3] Fix unicode literals --- youtube_dl/extractor/yleareena.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yleareena.py b/youtube_dl/extractor/yleareena.py index 8685dd8da..119697b92 100644 --- a/youtube_dl/extractor/yleareena.py +++ b/youtube_dl/extractor/yleareena.py @@ -20,7 +20,7 @@ class YleAreenaIE(InfoExtractor): 'ext': 'mxf', 'title': 'Luottomies | Luottomies jouluspeciaali', 'description': - u'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan ' + 'Tommia harmittaa kun sukulaiset ovat tulossa pilaamaan ' 'mukavan perhejoulun. Muuttuuko mieli isosta yllätyksestä? ' 'Joulun erikoisjakson on ohjannut Jalmari Helander.', 'upload_date': '20171207',