From 1ec0696196fc23a2484b4687d54e73e704036672 Mon Sep 17 00:00:00 2001 From: JChris246 Date: Tue, 5 Feb 2019 14:46:52 -0400 Subject: [PATCH 1/3] [SpankBangPlaylist] Add new extractor --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/spankbang.py | 33 +++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 693c16e49..d7685cd87 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1058,7 +1058,10 @@ from .southpark import ( SouthParkEsIE, SouthParkNlIE ) -from .spankbang import SpankBangIE +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 67500b69c..067c702ef 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -12,7 +12,7 @@ from ..utils import ( class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P[\da-z]+)/video' + _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P[\da-z-]+)/(?:video|playlist)' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -94,3 +94,34 @@ class SpankBangIE(InfoExtractor): 'formats': formats, 'age_limit': age_limit, } + + +class SpankBangPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P[\da-z]+)/playlist' + _TEST = { + 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', + 'info_dict': { + 'id': 'ug0k', + 'title': 'Big Ass Titties playlist', + }, + 'playlist_mincount': 2, + } + + def _extract_entries(self, webpage): + return [ + self.url_result( + 'http://www.%s/%s' % ('spankbang.com', video_url), + SpankBangIE.ie_key()) + for video_url in re.findall( + r'href="/?([\da-z-]+/playlist/[^"]+)', webpage) + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + webpage = self._download_webpage(url, playlist_id) + + entries = self._extract_entries(webpage) + title = self._search_regex(r'

(.+)

', webpage, 'playlist_title') + + return self.playlist_result(entries, playlist_id, title) From 1a7f6314b68c130a7001186c186548d054ee0280 Mon Sep 17 00:00:00 2001 From: JChris246 Date: Tue, 5 Feb 2019 19:25:02 -0400 Subject: [PATCH 2/3] [SpankBangPlaylistIE] change single video regex, used urljoin, eliminate duplicate video capture, playlist title is no longer fatal, used _match_id --- youtube_dl/extractor/spankbang.py | 33 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 067c702ef..17f823604 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -8,11 +8,12 @@ from ..utils import ( parse_duration, parse_resolution, str_to_int, + urljoin, ) class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P[\da-z-]+)/(?:video|playlist)' + _VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P[\da-z]+)/video' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -107,21 +108,27 @@ class SpankBangPlaylistIE(InfoExtractor): 'playlist_mincount': 2, } - def _extract_entries(self, webpage): - return [ - self.url_result( - 'http://www.%s/%s' % ('spankbang.com', video_url), - SpankBangIE.ie_key()) - for video_url in re.findall( - r'href="/?([\da-z-]+/playlist/[^"]+)', webpage) - ] + def _extract_entries(self, webpage, id): + video_items = re.findall(r']+class=[\'"].*?video-item[^>]*>\s*(.+?)>', webpage) + + entries = [] + if video_items: + for div in video_items: + page_url = self._search_regex( + r'href="/?(' + id + '-[\da-z]+/playlist/[^"]+)', div, 'page url', default=None) + + if page_url: + page = self._download_webpage(urljoin('http://spankbang.com', page_url), id) + canonical_url = self._search_regex( + r'link rel="canonical" href="(.+?)"', page, 'canonical_url') + entries.append(self.url_result(canonical_url, SpankBangIE.ie_key())) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = self._extract_entries(webpage) - title = self._search_regex(r'

(.+)

', webpage, 'playlist_title') + entries = self._extract_entries(webpage, playlist_id) + title = self._search_regex(r'

(.+)

', webpage, 'playlist_title', fatal=False) return self.playlist_result(entries, playlist_id, title) From 3eaeaaba4e15e56b2cf57071fcd444d868dac7e5 Mon Sep 17 00:00:00 2001 From: JChris246 Date: Wed, 6 Feb 2019 14:38:26 -0400 Subject: [PATCH 3/3] removed shadow, removed pointless check --- youtube_dl/extractor/spankbang.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 17f823604..e1f0c60cd 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -108,21 +108,20 @@ class SpankBangPlaylistIE(InfoExtractor): 'playlist_mincount': 2, } - def _extract_entries(self, webpage, id): + def _extract_entries(self, webpage, p_id): video_items = re.findall(r']+class=[\'"].*?video-item[^>]*>\s*(.+?)>', webpage) entries = [] - if video_items: - for div in video_items: - page_url = self._search_regex( - r'href="/?(' + id + '-[\da-z]+/playlist/[^"]+)', div, 'page url', default=None) + for div in video_items: + page_url = self._search_regex( + r'href="/?(' + p_id + '-[\da-z]+/playlist/[^"]+)', div, 'page url', default=None) - if page_url: - page = self._download_webpage(urljoin('http://spankbang.com', page_url), id) - canonical_url = self._search_regex( - r'link rel="canonical" href="(.+?)"', page, 'canonical_url') - entries.append(self.url_result(canonical_url, SpankBangIE.ie_key())) - return entries + if page_url: + page = self._download_webpage(urljoin('http://spankbang.com', page_url), p_id) + canonical_url = self._search_regex( + r'link rel="canonical" href="(.+?)"', page, 'canonical url') + entries.append(self.url_result(canonical_url, SpankBangIE.ie_key())) + return entries def _real_extract(self, url): playlist_id = self._match_id(url)