From 32f6c118cb16caac01905f4c1bee661d6b6f48f2 Mon Sep 17 00:00:00 2001
From: JChris246 <43832407+JChris246@users.noreply.github.com>
Date: Sat, 2 Feb 2019 15:20:14 -0400
Subject: [PATCH] improved function to grab categories and tags

---
 youtube_dl/extractor/pornhub.py | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 8bf4d9f62..1a2f07345 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -159,10 +159,12 @@ class PornHubIE(PornHubBaseIE):
         return str_to_int(self._search_regex(
             pattern, webpage, '%s count' % name, fatal=False))
 
-    def _get_text(self, str):
+    def _get_text(self, str, page):
         l = []
-        for a in re.finditer(r'<a[^>]+>([^<]+)', str):
-            l.append(a.group(1))
+        div = re.search(r'<div class="' + str + '">\s+[^\n]+\s+([^\n]+)', page)
+        if div:
+            for a in re.finditer(r'<a[^>]+>([^<]+)', div.group(1)):
+                l.append(a.group(1))
         return l
 
     def _real_extract(self, url):
@@ -308,17 +310,8 @@ class PornHubIE(PornHubBaseIE):
         comment_count = self._extract_count(
             r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment')
 
-        div = re.search(r'<div class="categoriesWrapper">\s+[^\n]+\s+([^\n]+)', webpage)
-        if div:
-            categories = self._get_text(div.group(1))
-        else:
-            categories = None
-
-        div = re.search(r'<div class="tagsWrapper">\s+Tags: \s+([^\n]+)', webpage)
-        if div:
-            tags = self._get_text(div.group(1))
-        else:
-            tags = None
+        categories = self._get_text("categoriesWrapper", webpage)
+        tags = self._get_text("tagsWrapper", webpage)
 
         return {
             'id': video_id,