From cc73d5ad15aed96f6462b8079ccb6716c2ef9f85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 01:18:25 +0700 Subject: [PATCH 1/3] [openload] Fix domains regex --- youtube_dl/extractor/openload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b638450af..679eaf6c3 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,12 +243,13 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _DOMAINS = r'''(?x) + _DOMAINS = r''' (?: openload\.(?:co|io|link|pw)| oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| - oladblock\.(?:services|xyz|me)|openloed\.co) - ''' + oladblock\.(?:services|xyz|me)|openloed\.co + ) + ''' _VALID_URL = r'''(?x) https?:// (?P<host> @@ -396,7 +397,7 @@ class OpenloadIE(InfoExtractor): @classmethod def _extract_urls(cls, webpage): return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' + r'(?x)<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' % (cls._DOMAINS, cls._EMBED_WORD), webpage) def _extract_decrypted_page(self, page_url, webpage, video_id): From d78657fd18ae6413239137298eee4c54f3efee32 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 22 Jul 2019 14:09:21 -0700 Subject: [PATCH 2/3] [extractor/generic] Add support for squarespace embeds (closes #21294) --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d34fc4b15..7dd2e2d5f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2075,6 +2075,17 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish 
Things 10', + }, + 'params': { + 'skip_download': True, + }, + }, { # Zype embed 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2395,6 +2406,13 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) + # unescape re.sub replacement + def unescape_resub(m): + return unescapeHTML(m.group(0)) + + # unescape squarespace video embeds + webpage = re.sub(r'<div[^>]+class=[^>]*?sqs-video-wrapper[^>]*>', unescape_resub, webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name From 7cb51b5daf07a6a627a9084394636c570194cc4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 Sep 2019 01:23:58 +0700 Subject: [PATCH 3/3] [extractor/generic] Improve squarespace detection and fix test (closes #21859, refs #21294, refs #21802) --- youtube_dl/extractor/generic.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7dd2e2d5f..d1725d98b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2081,6 +2081,11 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'Tc7b_JGdZfw', 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', }, 'params': { 'skip_download': True, @@ -2406,12 +2411,11 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) - # unescape re.sub replacement - def unescape_resub(m): - return unescapeHTML(m.group(0)) - - # unescape squarespace video embeds - webpage = re.sub(r'<div[^>]+class=[^>]*?sqs-video-wrapper[^>]*>', unescape_resub, 
webpage) + # Unescape squarespace embeds to be detected by generic extractor, + # see https://github.com/ytdl-org/youtube-dl/issues/21294 + webpage = re.sub( + r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', + lambda x: unescapeHTML(x.group(0)), webpage) # it's tempting to parse this further, but you would # have to take into account all the variations like