From 3810bcdd0aa7527be0b6a693c41de4ac928264d1 Mon Sep 17 00:00:00 2001 From: iamevn Date: Thu, 4 May 2017 22:41:01 -0400 Subject: [PATCH 1/5] initial attempt at adding full30 support --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/full30.py | 64 ++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 youtube_dl/extractor/full30.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c0020dd7d..0c68e30c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -359,6 +359,7 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .full30 import Full30IE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE diff --git a/youtube_dl/extractor/full30.py b/youtube_dl/extractor/full30.py new file mode 100644 index 000000000..c06ced2fb --- /dev/null +++ b/youtube_dl/extractor/full30.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Full30IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?full30\.com/video/(?P<id>[a-f0-9]+)' + _TEST = { + 'url': 'http://www.full30.com/video/b2a28b99494164ddd55e91a6c4648cbc', + 'md5': '88f6812042afaf60f74dbcd84d4491c2', + 'info_dict': { + 'id': 'b2a28b99494164ddd55e91a6c4648cbc', + 'ext': 'webm', + 'title': 'Flamethrower Q&A with Charlie Hobson', + 'thumbnail': r're:^https?://.*52130\.jpg$', + 'uploader' : 'Forgotten Weapons', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... 
+ title = self._html_search_regex(r'

]*class=.video-title[^>]*>([^<]+?)

', webpage, 'title') + uploader = self._html_search_regex(r'

]*>([^<]+)<', webpage, 'uploader', fatal=False) + description = self._og_search_description(webpage) + thumbnail = self._html_search_regex(r'<[^>]*property=.og:image. ?content="([^>]*thumbnails[^">]*)"\/>', webpage, 'thumbnail', fatal=False) or self._og_search_thumbnail(webpage) + + vidpath = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 'video_path', fatal=False) + vidjson = self._download_webpage(vidpath, video_id) + # this is robust + vidjson = vidjson.rstrip() + vidjson = "[" + vidjson + "]" + vidjson = vidjson.replace("}", "},").replace(",]","]") + parsed = self._parse_json(vidjson, video_id) + + formats = [] + for d in parsed: + if d["type"] == "object": + formats.append({ + "url" : vidpath + d["name"], + "resolution" : d["name"][:d["name"].rfind(".")], + "filesize" : d["size"], + "protocol" : "https" + }) + + return { + 'id': video_id, + 'title': title, + # 'description': description, + 'uploader': uploader, + # 'url' : url, + 'formats' : formats, + # TODO more properties (see youtube_dl/extractor/common.py) + 'ext': 'mp4', + 'thumbnail' : thumbnail, + } From 957a6811487cb08106386e569c7078ae0a814f68 Mon Sep 17 00:00:00 2001 From: iamevn Date: Thu, 4 May 2017 22:59:11 -0400 Subject: [PATCH 2/5] formats sorted --- docs/supportedsites.md | 1 + youtube_dl/extractor/full30.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e3c038c48..aa5124e9f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -286,6 +286,7 @@ - **Freesound** - **freespeech.org** - **FreshLive** + - **Full30** - **Funimation** - **FunnyOrDie** - **Fusion** diff --git a/youtube_dl/extractor/full30.py b/youtube_dl/extractor/full30.py index c06ced2fb..047cb2de3 100644 --- a/youtube_dl/extractor/full30.py +++ b/youtube_dl/extractor/full30.py @@ -8,10 +8,10 @@ class Full30IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?full30\.com/video/(?P<id>[a-f0-9]+)' _TEST = { 'url': 
'http://www.full30.com/video/b2a28b99494164ddd55e91a6c4648cbc', - 'md5': '88f6812042afaf60f74dbcd84d4491c2', + 'md5': 'f5aa3862cbe35c2083ce050ac1a5eb06', 'info_dict': { 'id': 'b2a28b99494164ddd55e91a6c4648cbc', - 'ext': 'webm', + 'ext': 'ogv', 'title': 'Flamethrower Q&A with Charlie Hobson', 'thumbnail': r're:^https?://.*52130\.jpg$', 'uploader' : 'Forgotten Weapons', @@ -51,6 +51,8 @@ class Full30IE(InfoExtractor): "protocol" : "https" }) + self._sort_formats(formats) + return { 'id': video_id, 'title': title, From be9e6386835de134042b3cc60b9229b6cd84bcad Mon Sep 17 00:00:00 2001 From: iamevn Date: Fri, 5 May 2017 00:42:22 -0400 Subject: [PATCH 3/5] cleaned up, added fallbacks --- youtube_dl/extractor/full30.py | 47 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/full30.py b/youtube_dl/extractor/full30.py index 047cb2de3..f519934ca 100644 --- a/youtube_dl/extractor/full30.py +++ b/youtube_dl/extractor/full30.py @@ -11,15 +11,10 @@ class Full30IE(InfoExtractor): 'md5': 'f5aa3862cbe35c2083ce050ac1a5eb06', 'info_dict': { 'id': 'b2a28b99494164ddd55e91a6c4648cbc', - 'ext': 'ogv', 'title': 'Flamethrower Q&A with Charlie Hobson', - 'thumbnail': r're:^https?://.*52130\.jpg$', 'uploader' : 'Forgotten Weapons', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'thumbnail': r're:^https?://.*52130\.jpg$', + 'ext': 'ogv', } } @@ -27,28 +22,32 @@ class Full30IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... - title = self._html_search_regex(r'

]*class=.video-title[^>]*>([^<]+?)

', webpage, 'title') - uploader = self._html_search_regex(r'

]*>([^<]+)<', webpage, 'uploader', fatal=False) - description = self._og_search_description(webpage) - thumbnail = self._html_search_regex(r'<[^>]*property=.og:image. ?content="([^>]*thumbnails[^">]*)"\/>', webpage, 'thumbnail', fatal=False) or self._og_search_thumbnail(webpage) + title = self._html_search_regex(r'

]*class=.video-title[^>]*>([^<]+?)

', webpage, 'title', fatal=False, default=None) or self._og_search_title(webpage) + uploader = self._html_search_regex(r'

]*>([^<]+)<', webpage, 'uploader', fatal=False, default=None) or None + thumbnail = self._html_search_regex(r'<[^>]*property=.og:image. ?content="([^>]*thumbnails[^">]*)"\/>', webpage, 'thumbnail', fatal=False, default=None) or self._og_search_thumbnail(webpage) - vidpath = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 'video_path', fatal=False) - vidjson = self._download_webpage(vidpath, video_id) - # this is robust - vidjson = vidjson.rstrip() - vidjson = "[" + vidjson + "]" - vidjson = vidjson.replace("}", "},").replace(",]","]") - parsed = self._parse_json(vidjson, video_id) + # looking for a line like the following + # + # there's also a full30.com/cdn which appears to have the same sort of structure. it's possible that either of these may go away so as a backup I'll build the cdn link out from channel slug + vid_path = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 'video_path', fatal=False, default=None) + if not vid_path: + channel_slug = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 'channel_slug', fatal=True) + vid_path = "https://www.full30.com/cdn/videos/" + channel_slug + "/" + video_id + "/" + + vid_json = self._download_webpage(vid_path, video_id) + # turn sequence of json entries into an actual list + vid_json = vid_json.rstrip() + vid_json = "[" + vid_json + "]" + vid_json = vid_json.replace("}", "},").replace(",]","]") + parsed = self._parse_json(vid_json, video_id) formats = [] for d in parsed: if d["type"] == "object": formats.append({ - "url" : vidpath + d["name"], + "url" : vid_path + d["name"], "resolution" : d["name"][:d["name"].rfind(".")], "filesize" : d["size"], - "protocol" : "https" }) self._sort_formats(formats) @@ -56,11 +55,7 @@ class Full30IE(InfoExtractor): return { 'id': video_id, 'title': title, - # 'description': description, 'uploader': uploader, - # 'url' : url, - 'formats' : formats, - # TODO more properties (see youtube_dl/extractor/common.py) - 'ext': 
'mp4', 'thumbnail' : thumbnail, + 'formats' : formats, } From feff8387412994d28a33761f18b1e5e0a3f32a78 Mon Sep 17 00:00:00 2001 From: iamevn Date: Sat, 6 May 2017 02:17:17 -0400 Subject: [PATCH 4/5] flake8 compliant, allow non-essential uploader field to be None, fallback to video_id as title (current behavior with generic extractor), make filesize int_or_none --- youtube_dl/extractor/full30.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/full30.py b/youtube_dl/extractor/full30.py index f519934ca..a434ba6d0 100644 --- a/youtube_dl/extractor/full30.py +++ b/youtube_dl/extractor/full30.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import int_or_none class Full30IE(InfoExtractor): @@ -12,7 +13,7 @@ class Full30IE(InfoExtractor): 'info_dict': { 'id': 'b2a28b99494164ddd55e91a6c4648cbc', 'title': 'Flamethrower Q&A with Charlie Hobson', - 'uploader' : 'Forgotten Weapons', + 'uploader': 'Forgotten Weapons', 'thumbnail': r're:^https?://.*52130\.jpg$', 'ext': 'ogv', } @@ -22,8 +23,8 @@ class Full30IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'

]*class=.video-title[^>]*>([^<]+?)

', webpage, 'title', fatal=False, default=None) or self._og_search_title(webpage) - uploader = self._html_search_regex(r'

]*>([^<]+)<', webpage, 'uploader', fatal=False, default=None) or None + title = self._html_search_regex(r'

]*class=.video-title[^>]*>([^<]+?)

', webpage, 'title', fatal=False, default=None) or self._og_search_title(webpage) or video_id + uploader = self._html_search_regex(r'

]*>([^<]+)<', webpage, 'uploader', fatal=False, default=None) or None thumbnail = self._html_search_regex(r'<[^>]*property=.og:image. ?content="([^>]*thumbnails[^">]*)"\/>', webpage, 'thumbnail', fatal=False, default=None) or self._og_search_thumbnail(webpage) # looking for a line like the following @@ -38,17 +39,14 @@ class Full30IE(InfoExtractor): # turn sequence of json entries into an actual list vid_json = vid_json.rstrip() vid_json = "[" + vid_json + "]" - vid_json = vid_json.replace("}", "},").replace(",]","]") + vid_json = vid_json.replace("}", "},").replace(",]", "]") parsed = self._parse_json(vid_json, video_id) - formats = [] - for d in parsed: - if d["type"] == "object": - formats.append({ - "url" : vid_path + d["name"], - "resolution" : d["name"][:d["name"].rfind(".")], - "filesize" : d["size"], - }) + formats = [{ + "url": vid_path + entry["name"], + "resolution": entry["name"][:entry["name"].rfind(".")], + "filesize": int_or_none(entry["size"]), + } for entry in parsed if entry.get("type") == "object"] self._sort_formats(formats) @@ -56,6 +54,6 @@ class Full30IE(InfoExtractor): 'id': video_id, 'title': title, 'uploader': uploader, - 'thumbnail' : thumbnail, - 'formats' : formats, + 'thumbnail': thumbnail, + 'formats': formats, } From 8f37417d83623268f98e782c589b0f49e7f1413a Mon Sep 17 00:00:00 2001 From: iamevn Date: Sun, 7 May 2017 19:10:29 -0400 Subject: [PATCH 5/5] be consistent with quotes --- youtube_dl/extractor/full30.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/full30.py b/youtube_dl/extractor/full30.py index a434ba6d0..4c78e4fb0 100644 --- a/youtube_dl/extractor/full30.py +++ b/youtube_dl/extractor/full30.py @@ -33,20 +33,20 @@ class Full30IE(InfoExtractor): vid_path = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 'video_path', fatal=False, default=None) if not vid_path: channel_slug = self._html_search_regex(r']*value=["\']([^"\']*)["\'][^>]*>', webpage, 
'channel_slug', fatal=True) - vid_path = "https://www.full30.com/cdn/videos/" + channel_slug + "/" + video_id + "/" + vid_path = 'https://www.full30.com/cdn/videos/' + channel_slug + '/' + video_id + '/' vid_json = self._download_webpage(vid_path, video_id) # turn sequence of json entries into an actual list vid_json = vid_json.rstrip() - vid_json = "[" + vid_json + "]" - vid_json = vid_json.replace("}", "},").replace(",]", "]") + vid_json = '[' + vid_json + ']' + vid_json = vid_json.replace('}', '},').replace(',]', ']') parsed = self._parse_json(vid_json, video_id) formats = [{ - "url": vid_path + entry["name"], - "resolution": entry["name"][:entry["name"].rfind(".")], - "filesize": int_or_none(entry["size"]), - } for entry in parsed if entry.get("type") == "object"] + 'url': vid_path + entry['name'], + 'resolution': entry['name'][:entry['name'].rfind('.')], + 'filesize': int_or_none(entry['size']), + } for entry in parsed if entry.get('type') == 'object'] self._sort_formats(formats)