From 6870bf7efcc42509a68d7f9d323a3e7e27702154 Mon Sep 17 00:00:00 2001 From: SweepingMonk Date: Sun, 7 May 2017 16:43:25 +0800 Subject: [PATCH] [douyutv] add support for multiple room on single page --- youtube_dl/extractor/douyutv.py | 50 +++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index d22133d24..3fa596f10 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,17 +3,48 @@ from __future__ import unicode_literals import time import hashlib +import re from .common import InfoExtractor from ..utils import ( ExtractorError, unescapeHTML, + compat_HTMLParser ) +class RoomIDParser(compat_HTMLParser): + + def __init__(self, room_index=None): + compat_HTMLParser.__init__(self) + self._room_index = room_index + self._room_id = None + + def handle_starttag(self, tag, attrs): + if tag != 'div' and tag != 'li': + return + + attrs_dict = dict(attrs) + # process switch button situation firstly + if(tag == 'li' + and attrs_dict.get('class', '') == 'switchRoom-btn' + and attrs_dict.get('data-index', '-1') == self._room_index): + self._room_id = attrs_dict.get('data-onlineid') + + if self._room_id is not None: + return + + if tag == 'div' and attrs_dict.get('data-component-id', '') == 'room': + self._room_id = attrs_dict.get('data-onlineid') + + @property + def room_id(self): + return self._room_id + + class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P(?:[^/]+/)*[A-Za-z0-9]+(?:\?roomIndex=\d+)?)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -67,6 +98,9 @@ class DouyuTVIE(InfoExtractor): # \"room_id\" 'url': 'http://www.douyu.com/t/lpl', 'only_matching': True, + }, { + 'url': 'http://www.douyu.com/t/douyukpl?roomIndex=1', + 'only_matching': True }] def _real_extract(self, url): @@ -76,8 +110,18 @@ class DouyuTVIE(InfoExtractor): room_id = video_id else: page = self._download_webpage(url, video_id) - room_id = self._html_search_regex( - r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + if not video_id.startswith('t/'): + room_id = self._html_search_regex( + r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + else: + match_obj = re.match(r'.+roomIndex=(\d+)', video_id) + # default room index is 0 + room_index = match_obj.group(1) if match_obj is not None else '0' + room_id_parser = RoomIDParser(room_index) + room_id_parser.feed(page) + room_id = room_id_parser.room_id + if room_id is None: + raise ExtractorError('Extracting room id failed.') # Grab metadata from mobile API room = self._download_json(