From 18414edeb7ebb3c9261f300cbee734a13abdf7fa Mon Sep 17 00:00:00 2001
From: Alex Aplin
Date: Sun, 30 Jul 2017 18:32:01 -0400
Subject: [PATCH 1/2] [Iwara] Improve metadata extraction
---
youtube_dl/extractor/iwara.py | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py
index a7514fc80..44cc2c67d 100644
--- a/youtube_dl/extractor/iwara.py
+++ b/youtube_dl/extractor/iwara.py
@@ -1,12 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
int_or_none,
mimetype2ext,
remove_end,
+ clean_html,
+ get_element_by_class,
+ get_elements_by_class,
+ unified_strdate,
)
@@ -20,6 +26,12 @@ class IwaraIE(InfoExtractor):
'ext': 'mp4',
'title': '【MMD R-18】ガールフレンド carry_me_off',
'age_limit': 18,
+ 'upload_date': '20150828',
+ 'uploader': 'Reimu丨Action',
+ 'description': '禁止转载\n\n=acfun=\n=bilibili=\n=youtube=\n\n=stage=\n=motion=\n=camera=\n=dress=',
+ 'comment_count': int,
+ 'like_count': int,
+ 'view_count': int,
},
}, {
'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
@@ -71,6 +83,19 @@ class IwaraIE(InfoExtractor):
title = remove_end(self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False))
+
+ uploader = get_element_by_class('username', webpage)
+
+ description = clean_html(get_element_by_class('field-type-text-with-summary', webpage).replace('
', '
'))
+
+ comment_count = int_or_none(re.sub('\D', '', get_elements_by_class('title', webpage)[1]))
+
+ node_views = clean_html(get_element_by_class('node-views', webpage)).split()
+ like_count = int_or_none(node_views[0].replace(',', ''))
+ view_count = int_or_none(node_views[1].replace(',', ''))
+
formats = []
for a_format in video_data:
format_id = a_format.get('resolution')
@@ -92,4 +117,10 @@ class IwaraIE(InfoExtractor):
'title': title,
'age_limit': age_limit,
'formats': formats,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'description': description,
+ 'comment_count': comment_count,
+ 'like_count': like_count,
+ 'view_count': view_count,
}
From a954ccfd3ba80450093e537ba5f08b078d138961 Mon Sep 17 00:00:00 2001
From: Alex Aplin
Date: Sat, 5 Aug 2017 22:48:27 -0400
Subject: [PATCH 2/2] Safer extraction
---
youtube_dl/extractor/iwara.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py
index 44cc2c67d..119bb44f4 100644
--- a/youtube_dl/extractor/iwara.py
+++ b/youtube_dl/extractor/iwara.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
@@ -88,13 +86,20 @@ class IwaraIE(InfoExtractor):
uploader = get_element_by_class('username', webpage)
- description = clean_html(get_element_by_class('field-type-text-with-summary', webpage).replace('', '
'))
+ description_class = get_element_by_class('field-type-text-with-summary', webpage)
+ description = clean_html(description_class.replace('', '
') if description_class else None)
- comment_count = int_or_none(re.sub('\D', '', get_elements_by_class('title', webpage)[1]))
+ comment_count_classes = get_elements_by_class('title', webpage)
+ comment_count = None
+ if comment_count_classes and len(comment_count_classes) >= 2:
+ comment_count = int_or_none(''.join(digit for digit in comment_count_classes[1] if digit.isdigit()))
- node_views = clean_html(get_element_by_class('node-views', webpage)).split()
- like_count = int_or_none(node_views[0].replace(',', ''))
- view_count = int_or_none(node_views[1].replace(',', ''))
+ node_views_class = clean_html(get_element_by_class('node-views', webpage))
+ node_views = node_views_class.split() if node_views_class else None
+ like_count = view_count = None
+ if node_views and len(node_views) >= 2:
+ like_count = int_or_none(node_views[0].replace(',', ''))
+ view_count = int_or_none(node_views[1].replace(',', ''))
formats = []
for a_format in video_data: