diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d34fc4b15..d1725d98b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2075,6 +2075,22 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + }, + 'params': { + 'skip_download': True, + }, + }, { # Zype embed 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2395,6 +2411,12 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) + # Unescape squarespace embeds to be detected by generic extractor, + # see https://github.com/ytdl-org/youtube-dl/issues/21294 + webpage = re.sub( + r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', + lambda x: unescapeHTML(x.group(0)), webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index b638450af..679eaf6c3 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -243,12 +243,13 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _DOMAINS = r'''(?x) + _DOMAINS = r''' (?: openload\.(?:co|io|link|pw)| oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| - oladblock\.(?:services|xyz|me)|openloed\.co) - ''' + oladblock\.(?:services|xyz|me)|openloed\.co + ) + ''' _VALID_URL = r'''(?x) https?:// (?P<host> @@ -396,7 +397,7 @@ class OpenloadIE(InfoExtractor): 
@classmethod def _extract_urls(cls, webpage): return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' + r'(?x)<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' % (cls._DOMAINS, cls._EMBED_WORD), webpage) def _extract_decrypted_page(self, page_url, webpage, video_id):