From 18414edeb7ebb3c9261f300cbee734a13abdf7fa Mon Sep 17 00:00:00 2001 From: Alex Aplin Date: Sun, 30 Jul 2017 18:32:01 -0400 Subject: [PATCH 1/2] [Iwara] Improve metadata extraction --- youtube_dl/extractor/iwara.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index a7514fc80..44cc2c67d 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -1,12 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( int_or_none, mimetype2ext, remove_end, + clean_html, + get_element_by_class, + get_elements_by_class, + unified_strdate, ) @@ -20,6 +26,12 @@ class IwaraIE(InfoExtractor): 'ext': 'mp4', 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, + 'upload_date': '20150828', + 'uploader': 'Reimu丨Action', + 'description': '禁止转载\n\n=acfun=\n=bilibili=\n=youtube=\n\n=stage=\n=motion=\n=camera=\n=dress=', + 'comment_count': int, + 'like_count': int, + 'view_count': int, }, }, { 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', @@ -71,6 +83,19 @@ class IwaraIE(InfoExtractor): title = remove_end(self._html_search_regex( r'([^<]+)', webpage, 'title'), ' | Iwara') + upload_date = unified_strdate(self._html_search_regex( + r'(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False)) + + uploader = get_element_by_class('username', webpage) + + description = clean_html(get_element_by_class('field-type-text-with-summary', webpage).replace('

', '

')) + + comment_count = int_or_none(re.sub('\D', '', get_elements_by_class('title', webpage)[1])) + + node_views = clean_html(get_element_by_class('node-views', webpage)).split() + like_count = int_or_none(node_views[0].replace(',', '')) + view_count = int_or_none(node_views[1].replace(',', '')) + formats = [] for a_format in video_data: format_id = a_format.get('resolution') @@ -92,4 +117,10 @@ class IwaraIE(InfoExtractor): 'title': title, 'age_limit': age_limit, 'formats': formats, + 'upload_date': upload_date, + 'uploader': uploader, + 'description': description, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, } From a954ccfd3ba80450093e537ba5f08b078d138961 Mon Sep 17 00:00:00 2001 From: Alex Aplin Date: Sat, 5 Aug 2017 22:48:27 -0400 Subject: [PATCH 2/2] Safer extraction --- youtube_dl/extractor/iwara.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 44cc2c67d..119bb44f4 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( @@ -88,13 +86,20 @@ class IwaraIE(InfoExtractor): uploader = get_element_by_class('username', webpage) - description = clean_html(get_element_by_class('field-type-text-with-summary', webpage).replace('

', '

')) + description_class = get_element_by_class('field-type-text-with-summary', webpage) + description = clean_html(description_class.replace('

', '

') if description_class else None) - comment_count = int_or_none(re.sub('\D', '', get_elements_by_class('title', webpage)[1])) + comment_count_classes = get_elements_by_class('title', webpage) + comment_count = None + if comment_count_classes and len(comment_count_classes) >= 2: + comment_count = int_or_none(''.join(digit for digit in comment_count_classes[1] if digit.isdigit())) - node_views = clean_html(get_element_by_class('node-views', webpage)).split() - like_count = int_or_none(node_views[0].replace(',', '')) - view_count = int_or_none(node_views[1].replace(',', '')) + node_views_class = clean_html(get_element_by_class('node-views', webpage)) + node_views = node_views_class.split() if node_views_class else None + like_count = view_count = None + if node_views and len(node_views) >= 2: + like_count = int_or_none(node_views[0].replace(',', '')) + view_count = int_or_none(node_views[1].replace(',', '')) formats = [] for a_format in video_data: