From 285eefc81b76eb0743d9d86bc1081105fc64d86b Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Sun, 29 Apr 2018 22:32:46 +0530 Subject: [PATCH 1/9] [ibm think videos] Added new extractor --- youtube_dl/extractor/extractors.py | 4 +++ youtube_dl/extractor/ibm_think.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 youtube_dl/extractor/ibm_think.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6fb65e4fe..7c8e42ed4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -450,6 +450,10 @@ from .hrti import ( from .huajiao import HuajiaoIE from .huffpost import HuffPostIE from .hypem import HypemIE +from .ibm_think import ( + IbmThinkIE, + IbmThinkPlaylistIE, +) from .iconosquare import IconosquareIE from .ign import ( IGNIE, diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py new file mode 100644 index 000000000..815f29d73 --- /dev/null +++ b/youtube_dl/extractor/ibm_think.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ustream import UstreamIE + + +class IbmThinkIE(InfoExtractor): + IE_DESC = 'IBM Think Videos' + IE_NAME = 'IBMThink' + _VALID_URL = r'https?://(?:www\.)?ibm\.com/events/think/watch/replay/(?P<id>[0-9]+)/?' + _TESTS = [{ + 'url': 'https://www.ibm.com/events/think/watch/replay/113734399/', + 'md5': '0a3f1c81c58aacbbb36e292a1c1f9690', + 'info_dict': { + 'id': '113734399', + 'ext': 'mp4', + 'title': 'Think 2018 Chairman\'s Address: Putting Smart to Work', + 'timestamp': 1521575552, + 'upload_date': '20180320', + 'uploader': 'f8k4md3yana', + 'uploader_id': '43178333', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + ustream_url = self._html_search_regex(r'[0-9]+)/?' 
+ _TESTS = [{ + 'url': 'https://www.ibm.com/events/think/watch/playlist/241295/', + 'info_dict': { + 'id': '241295', + 'title': 'Five innovations that will help change our lives within five years', + 'description': 'Discover what the world is thinking at Think 2018, IBM\'s first business event to go beyond IT conference, exploring cloud technology, data analytics & security.' + }, + 'playlist_mincount': 6 + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + entries = [self.url_result(m) for m in re.findall(r'', webpage)] + title = self._html_search_regex(r'(?:.+?)\s\|\s(?:.+?)\s\|\s(.+?)', webpage, 'title') + description = self._og_search_description(webpage) + return self.playlist_result(entries, playlist_id, title, description) From 93041d4ceca35fe661d4d4608f5ec18d3a12cefc Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Sun, 29 Apr 2018 23:42:10 +0530 Subject: [PATCH 2/9] Fixed ibm think playlist downloader regex and handled ustream via generic parser --- youtube_dl/extractor/ibm_think.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index 815f29d73..b6a45403f 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .ustream import UstreamIE +from .generic import GenericIE class IbmThinkIE(InfoExtractor): IE_DESC = 'IBM Think Videos' IE_NAME = 'IBMThink' - _VALID_URL = r'https?://(?:www\.)?ibm\.com/events/think/watch/replay/(?P<id>[0-9]+)/?' + _VALID_URL = r'https?://(?:www\.)?ibm\.com/events/think/watch/(playlist/)?(\d*/)?replay/(?P<id>[0-9]+)/?' 
_TESTS = [{ 'url': 'https://www.ibm.com/events/think/watch/replay/113734399/', 'md5': '0a3f1c81c58aacbbb36e292a1c1f9690', @@ -29,7 +29,7 @@ class IbmThinkIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) ustream_url = self._html_search_regex(r'', webpage)] + entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] title = self._html_search_regex(r'(?:.+?)\s\|\s(?:.+?)\s\|\s(.+?)', webpage, 'title') description = self._og_search_description(webpage) return self.playlist_result(entries, playlist_id, title, description) From 17bfdc7404d9760ccbbb9b3636b3e5fce9d7adec Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Mon, 30 Apr 2018 19:54:45 +0530 Subject: [PATCH 3/9] Removed superfluous non-capturing groups --- youtube_dl/extractor/ibm_think.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index b6a45403f..2a63e0f38 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -50,6 +50,6 @@ class IbmThinkPlaylistIE(InfoExtractor): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] - title = self._html_search_regex(r'(?:.+?)\s\|\s(?:.+?)\s\|\s(.+?)', webpage, 'title') + title = self._html_search_regex(r'.+?\s\|\s.+?\s\|\s(.+?)', webpage, 'title') description = self._og_search_description(webpage) return self.playlist_result(entries, playlist_id, title, description) From 201f149cf7e4398976fe1a04c50e31fb2594e509 Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Mon, 30 Apr 2018 21:21:58 +0530 Subject: [PATCH 4/9] Replaced entry match regex to match links with specified classes only --- youtube_dl/extractor/ibm_think.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index 
2a63e0f38..fd825599c 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -49,7 +49,7 @@ class IbmThinkPlaylistIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] + entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] title = self._html_search_regex(r'.+?\s\|\s.+?\s\|\s(.+?)', webpage, 'title') description = self._og_search_description(webpage) return self.playlist_result(entries, playlist_id, title, description) From 2605e97bda37927fd9ac676489ab3077fa42d566 Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Mon, 30 Apr 2018 21:22:27 +0530 Subject: [PATCH 5/9] Made playlist title non fatal --- youtube_dl/extractor/ibm_think.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index fd825599c..7589cb856 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -50,6 +50,6 @@ class IbmThinkPlaylistIE(InfoExtractor): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] - title = self._html_search_regex(r'.+?\s\|\s.+?\s\|\s(.+?)', webpage, 'title') + title = self._html_search_regex(r'.+?\s\|\s.+?\s\|\s(.+?)', webpage, 'title', fatal=False) description = self._og_search_description(webpage) return self.playlist_result(entries, playlist_id, title, description) From 15b71e4a61d2b3aab0fc72b78f355300d6d31ba1 Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Mon, 30 Apr 2018 21:59:08 +0530 Subject: [PATCH 6/9] Added https for ustream iframe sources. 
--- youtube_dl/extractor/ustream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 5737d4d16..caf390f07 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -72,7 +72,7 @@ class UstreamIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.ustream\.tv/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url') From 5305ecf7f9a8013fcb2a382908d3e343b7e2ff26 Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Mon, 30 Apr 2018 21:59:45 +0530 Subject: [PATCH 7/9] Moved ibm think videos to GenericIE --- youtube_dl/extractor/extractors.py | 7 ++----- youtube_dl/extractor/ibm_think.py | 25 ------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7c8e42ed4..e3bd07155 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -450,10 +450,7 @@ from .hrti import ( from .huajiao import HuajiaoIE from .huffpost import HuffPostIE from .hypem import HypemIE -from .ibm_think import ( - IbmThinkIE, - IbmThinkPlaylistIE, -) +from .ibm_think import IbmThinkPlaylistIE from .iconosquare import IconosquareIE from .ign import ( IGNIE, @@ -1336,7 +1333,7 @@ from .webofstories import ( WebOfStoriesPlaylistIE, ) from .weibo import ( - WeiboIE, + WeiboIE, WeiboMobileIE ) from .weiqitv import WeiqiTVIE diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index 7589cb856..4541c78ae 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -7,31 +7,6 @@ from .common import InfoExtractor from .generic import GenericIE -class IbmThinkIE(InfoExtractor): - IE_DESC = 'IBM Think Videos' - IE_NAME = 'IBMThink' - _VALID_URL = 
r'https?://(?:www\.)?ibm\.com/events/think/watch/(playlist/)?(\d*/)?replay/(?P<id>[0-9]+)/?' - _TESTS = [{ - 'url': 'https://www.ibm.com/events/think/watch/replay/113734399/', - 'md5': '0a3f1c81c58aacbbb36e292a1c1f9690', - 'info_dict': { - 'id': '113734399', - 'ext': 'mp4', - 'title': 'Think 2018 Chairman\'s Address: Putting Smart to Work', - 'timestamp': 1521575552, - 'upload_date': '20180320', - 'uploader': 'f8k4md3yana', - 'uploader_id': '43178333', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ustream_url = self._html_search_regex(r' Date: Tue, 1 May 2018 01:32:19 +0530 Subject: [PATCH 8/9] Fixed regex --- youtube_dl/extractor/ibm_think.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ibm_think.py b/youtube_dl/extractor/ibm_think.py index 4541c78ae..5a0ee10bf 100644 --- a/youtube_dl/extractor/ibm_think.py +++ b/youtube_dl/extractor/ibm_think.py @@ -24,7 +24,7 @@ class IbmThinkPlaylistIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] + entries = [self.url_result(m, GenericIE.ie_key()) for m in re.findall(r'', webpage)] title = self._html_search_regex(r'.+?\s\|\s.+?\s\|\s(.+?)', webpage, 'title', fatal=False) description = self._og_search_description(webpage) return self.playlist_result(entries, playlist_id, title, description) From 56d61298efa4a55b163cd5583e2cb6de2304ce86 Mon Sep 17 00:00:00 2001 From: Parth Verma Date: Tue, 1 May 2018 15:39:20 +0530 Subject: [PATCH 9/9] Made https?: optional in extract urls for ustream --- youtube_dl/extractor/ustream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index caf390f07..beac22264 100644 --- a/youtube_dl/extractor/ustream.py +++ 
b/youtube_dl/extractor/ustream.py @@ -72,7 +72,7 @@ class UstreamIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.ustream\.tv/embed/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//www\.ustream\.tv/embed/.+?)\1', webpage) if mobj is not None: return mobj.group('url')