From f6199a619d98fb6684655bab7c9a2da8b910f450 Mon Sep 17 00:00:00 2001 From: Simon Morgan Date: Fri, 7 Oct 2016 16:17:57 +0100 Subject: [PATCH 1/5] [yuvutu] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/yuvutu.py | 46 ++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 youtube_dl/extractor/yuvutu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index feee06004..6ae0f5a4c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1162,6 +1162,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) +from .yuvutu import YuvutuIE from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/yuvutu.py b/youtube_dl/extractor/yuvutu.py new file mode 100644 index 000000000..ebf0846fb --- /dev/null +++ b/youtube_dl/extractor/yuvutu.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import determine_ext + + +class YuvutuIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?yuvutu.com/video/(?P<id>[0-9]+)(?:.*)' + _TEST = { + 'url': 'http://www.yuvutu.com/video/330/', + 'md5': 'af4a0d2eabec6b6bd43cd6b68543fa9c', + 'info_dict': { + 'id': '330', + 'title': 'carnal bliss', + 'ext': 'flv', + 'age_limit': 18, + } + } + + _title_regex = r"class=[\"']video-title-content[\"']>.+?>(.+?)<" + _thumbnail_regex = r"itemprop=[\"']thumbnailURL[\"']\s+content=[\"'](.+?)[\"']" + _embed_regex = r"[\"'](\/embed_video\.php.+?)[\"']" + _video_regex = r"file:\s*[\"']([^\s]+)[\"']" + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(self._title_regex, webpage, 'title') + + embed_url = self._html_search_regex(self._embed_regex, webpage, + 'embed') + embed_webpage = self._download_webpage( +
"http://www.yuvutu.com/" + embed_url, video_id) + video_url = self._html_search_regex(self._video_regex, embed_webpage, + 'video_url') + + return { + 'id': video_id, + 'url': video_url, + 'ext': determine_ext(video_url, 'mp4'), + 'title': title, + 'age_limit': 18, + } From 5d4d88151f75f90b61f82bac15c6bf633fdc50b0 Mon Sep 17 00:00:00 2001 From: Simon Morgan Date: Fri, 7 Oct 2016 16:23:16 +0100 Subject: [PATCH 2/5] [yuvutu] Export thumbnail URL --- youtube_dl/extractor/yuvutu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/yuvutu.py b/youtube_dl/extractor/yuvutu.py index ebf0846fb..eca020fda 100644 --- a/youtube_dl/extractor/yuvutu.py +++ b/youtube_dl/extractor/yuvutu.py @@ -29,6 +29,7 @@ class YuvutuIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(self._title_regex, webpage, 'title') + thumbnail_url = self._html_search_regex(self._thumbnail_regex, webpage, 'thumbnail') embed_url = self._html_search_regex(self._embed_regex, webpage, 'embed') @@ -40,6 +41,7 @@ class YuvutuIE(InfoExtractor): return { 'id': video_id, 'url': video_url, + 'thumbnail': thumbnail_url, 'ext': determine_ext(video_url, 'mp4'), 'title': title, 'age_limit': 18, From 77da0152fe149e19d5da75e27fdfc43a1ddc13fd Mon Sep 17 00:00:00 2001 From: Simon Morgan Date: Fri, 7 Oct 2016 17:19:00 +0100 Subject: [PATCH 3/5] [yuvutu] Move regular expressions to place of usage. 
--- youtube_dl/extractor/yuvutu.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/yuvutu.py b/youtube_dl/extractor/yuvutu.py index eca020fda..7faa5b703 100644 --- a/youtube_dl/extractor/yuvutu.py +++ b/youtube_dl/extractor/yuvutu.py @@ -18,25 +18,22 @@ class YuvutuIE(InfoExtractor): } } - _title_regex = r"class=[\"']video-title-content[\"']>.+?>(.+?)<" - _thumbnail_regex = r"itemprop=[\"']thumbnailURL[\"']\s+content=[\"'](.+?)[\"']" - _embed_regex = r"[\"'](\/embed_video\.php.+?)[\"']" - _video_regex = r"file:\s*[\"']([^\s]+)[\"']" - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(self._title_regex, webpage, 'title') - thumbnail_url = self._html_search_regex(self._thumbnail_regex, webpage, 'thumbnail') + title = self._html_search_regex( + r"class=[\"']video-title-content[\"']>.+?>(.+?)<", webpage, 'title') + thumbnail_url = self._html_search_regex( + r"itemprop=[\"']thumbnailURL[\"']\s+content=[\"'](.+?)[\"']", webpage, 'thumbnail') - embed_url = self._html_search_regex(self._embed_regex, webpage, - 'embed') + embed_url = self._html_search_regex( + r"[\"'](\/embed_video\.php.+?)[\"']", webpage, 'embed') embed_webpage = self._download_webpage( "http://www.yuvutu.com/" + embed_url, video_id) - video_url = self._html_search_regex(self._video_regex, embed_webpage, - 'video_url') + video_url = self._html_search_regex( + r"file:\s*[\"']([^\s]+)[\"']", embed_webpage, 'video_url') return { 'id': video_id, From 1f3cfb8178b467c77cdf88e94f3f2e71403226be Mon Sep 17 00:00:00 2001 From: Simon Morgan Date: Sun, 9 Oct 2016 12:56:19 +0100 Subject: [PATCH 4/5] [yuvutu] Check for thumbnail in test --- youtube_dl/extractor/yuvutu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/yuvutu.py b/youtube_dl/extractor/yuvutu.py index 7faa5b703..d54175e9e 100644 --- a/youtube_dl/extractor/yuvutu.py +++ 
b/youtube_dl/extractor/yuvutu.py @@ -15,6 +15,7 @@ class YuvutuIE(InfoExtractor): 'title': 'carnal bliss', 'ext': 'flv', 'age_limit': 18, + 'thumbnail': 're:https?://.*?\.jpg' } } From 90dfa03af52375dbbfdccbf1403c9b0aed6db496 Mon Sep 17 00:00:00 2001 From: Simon Morgan Date: Tue, 11 Oct 2016 10:44:22 +0100 Subject: [PATCH 5/5] [yuvutu] Support for user pages --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/yuvutu.py | 41 +++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b3a5884b7..d7ddcf09c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1166,7 +1166,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) -from .yuvutu import YuvutuIE +from .yuvutu import YuvutuIE, YuvutuUserIE from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/yuvutu.py b/youtube_dl/extractor/yuvutu.py index d54175e9e..71a505066 100644 --- a/youtube_dl/extractor/yuvutu.py +++ b/youtube_dl/extractor/yuvutu.py @@ -1,8 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import re + from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + determine_ext, + sanitized_Request, +) class YuvutuIE(InfoExtractor): @@ -44,3 +50,36 @@ class YuvutuIE(InfoExtractor): 'title': title, 'age_limit': 18, } + + +class YuvutuUserIE(InfoExtractor): + IE_DESC = 'Yuvutu user profile' + _VALID_URL = r'http://(?:www\.)?yuvutu\.com/modules\.php\?name=YuPeople&action=view_videos&user_id=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.yuvutu.com/modules.php?name=YuPeople&action=view_videos&user_id=1072966', + 'info_dict': { + 'id': '1072966', + 'age_limit': 18, + }, + 'playlist_mincount': 90, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + entries = [] + for pagenum
in itertools.count(1): + request = sanitized_Request( + 'http://www.yuvutu.com/modules.php?name=YuPeople&action=view_videos&user_id=%s&page=%d' % (user_id, pagenum)) + page = self._download_webpage(request, user_id, 'Downloading user page %d' % pagenum) + + video_ids = re.findall( + r'class=[\'"]thumb-image[\'"]>\s+