From 31fa1e92b663eb67d7f814e659b3ab3d2d7b6abd Mon Sep 17 00:00:00 2001 From: cwd24 Date: Wed, 11 Apr 2018 01:28:02 +0200 Subject: [PATCH 1/5] [senedd] add extractor for senedd.tv --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/senedd.py | 63 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 youtube_dl/extractor/senedd.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9f60114d..691bc3ac6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -947,6 +947,7 @@ from .scrippsnetworks import ScrippsNetworksWatchIE from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE +from .senedd import SeneddIE from .servingsys import ServingSysIE from .servus import ServusIE from .sevenplus import SevenPlusIE diff --git a/youtube_dl/extractor/senedd.py b/youtube_dl/extractor/senedd.py new file mode 100644 index 000000000..637dd45fb --- /dev/null +++ b/youtube_dl/extractor/senedd.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import parse_duration +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) + + +class SeneddIE(InfoExtractor): + _VALID_URL = r'http://senedd\.tv/Meeting/(?:Archive|Clip)/(?P[0-9a-f\-]+)' + # TODO: some old links which redirect: http://www.senedd.tv/cy/4251?startPos=6&l=cy + _TEST = { + 'url': 'http://senedd.tv/Meeting/Clip/f2a274d3-a15a-4dec-b92b-be233eed9601?inPoint=00:50:35&outPoint=02:39:16', + # http://senedd.tv/Meeting/Archive/f2a274d3-a15a-4dec-b92b-be233eed9601?autostart=True + 'md5': '673307fe76d3c885bf02d8b146f10a2f', + 'info_dict': { + 'id': 'f2a274d3-a15a-4dec-b92b-be233eed9601', + 'ext': 'mp4', + 'title': 'Plenary', + 'thumbnail': r're:^http://.*\.jpg$', + 'language': 'en', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + # http://player.nafw.cloud.vualto.com/Player/Index/f2a274d3-a15a-4dec-b92b-be233eed9601?autostart=True&captionsOn=False + webpage = self._download_webpage(url, video_id) + inverted_language = self._html_search_regex(r' Date: Wed, 11 Apr 2018 01:37:18 +0200 Subject: [PATCH 2/5] [senedd] remove some comments (removing tabs in the process) --- youtube_dl/extractor/senedd.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/senedd.py b/youtube_dl/extractor/senedd.py index 637dd45fb..a48059496 100644 --- a/youtube_dl/extractor/senedd.py +++ b/youtube_dl/extractor/senedd.py @@ -14,7 +14,6 @@ class SeneddIE(InfoExtractor): # TODO: some old links which redirect: http://www.senedd.tv/cy/4251?startPos=6&l=cy _TEST = { 'url': 'http://senedd.tv/Meeting/Clip/f2a274d3-a15a-4dec-b92b-be233eed9601?inPoint=00:50:35&outPoint=02:39:16', - # http://senedd.tv/Meeting/Archive/f2a274d3-a15a-4dec-b92b-be233eed9601?autostart=True 'md5': '673307fe76d3c885bf02d8b146f10a2f', 'info_dict': { 'id': 'f2a274d3-a15a-4dec-b92b-be233eed9601', @@ -27,7 +26,6 @@ class SeneddIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # http://player.nafw.cloud.vualto.com/Player/Index/f2a274d3-a15a-4dec-b92b-be233eed9601?autostart=True&captionsOn=False webpage = self._download_webpage(url, video_id) inverted_language = self._html_search_regex(r' Date: Fri, 13 Apr 2018 20:24:42 +0100 Subject: [PATCH 3/5] [senedd] Remove unicode and add additional http request to ensure correct m3u8 url --- youtube_dl/extractor/senedd.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/senedd.py b/youtube_dl/extractor/senedd.py index a48059496..96a7162fa 100644 --- a/youtube_dl/extractor/senedd.py +++ b/youtube_dl/extractor/senedd.py @@ -7,6 +7,7 @@ from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) +import re class SeneddIE(InfoExtractor): @@ -14,7 +15,7 @@ class SeneddIE(InfoExtractor): # TODO: some old links which redirect: http://www.senedd.tv/cy/4251?startPos=6&l=cy _TEST = { 'url': 'http://senedd.tv/Meeting/Clip/f2a274d3-a15a-4dec-b92b-be233eed9601?inPoint=00:50:35&outPoint=02:39:16', - 'md5': '673307fe76d3c885bf02d8b146f10a2f', + 'md5': 'b4c66ce851d67dcccc2a2deb2871707c', 'info_dict': { 'id': 'f2a274d3-a15a-4dec-b92b-be233eed9601', 'ext': 'mp4', @@ -27,16 +28,12 @@ class SeneddIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - inverted_language = self._html_search_regex(r' Date: Fri, 13 Apr 2018 21:45:21 +0200 Subject: [PATCH 4/5] Expand URL coverage, and correct md5 --- youtube_dl/extractor/senedd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/senedd.py b/youtube_dl/extractor/senedd.py index 96a7162fa..013897e75 100644 --- a/youtube_dl/extractor/senedd.py +++ b/youtube_dl/extractor/senedd.py @@ -11,11 +11,11 @@ import re class SeneddIE(InfoExtractor): - _VALID_URL = r'http://senedd\.tv/Meeting/(?:Archive|Clip)/(?P[0-9a-f\-]+)' + _VALID_URL = r'http://(?:www\.)?senedd\.tv/Meeting/(?:Archive|Clip)/(?P[0-9a-f\-]+)' # TODO: some old links which redirect: http://www.senedd.tv/cy/4251?startPos=6&l=cy _TEST = { 'url': 'http://senedd.tv/Meeting/Clip/f2a274d3-a15a-4dec-b92b-be233eed9601?inPoint=00:50:35&outPoint=02:39:16', - 'md5': 'b4c66ce851d67dcccc2a2deb2871707c', + 'md5': '57e83ed0b3816d6661f0b51e74818765', 'info_dict': { 'id': 'f2a274d3-a15a-4dec-b92b-be233eed9601', 'ext': 'mp4', From 69af9079951d1164043784a186323366c99f50a7 Mon Sep 17 00:00:00 2001 From: Curon Date: Sat, 14 Apr 2018 20:52:04 +0100 Subject: [PATCH 5/5] Relax regex, prevent empty string match, and move title extraction before return. --- youtube_dl/extractor/senedd.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/senedd.py b/youtube_dl/extractor/senedd.py index 013897e75..60c077c0c 100644 --- a/youtube_dl/extractor/senedd.py +++ b/youtube_dl/extractor/senedd.py @@ -29,10 +29,12 @@ class SeneddIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - iframe_src = self._html_search_regex(r'(?: