[techtalks] Use get_element_by_class and rely on _match_id

This commit is contained in:
Déstin Reed 2016-10-01 13:54:35 +02:00
parent 63e0fd5bcc
commit 03999993af

View File

@@ -4,14 +4,13 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
get_element_by_attribute, get_element_by_class,
clean_html, clean_html,
) )
class TechTalksIE(InfoExtractor): class TechTalksIE(InfoExtractor):
_VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/' _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
_TEST = { _TEST = {
'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
'info_dict': { 'info_dict': {
@@ -41,15 +40,14 @@ class TechTalksIE(InfoExtractor):
} }
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) talk_id = self._match_id(url)
talk_id = mobj.group('id')
webpage = self._download_webpage(url, talk_id) webpage = self._download_webpage(url, talk_id)
rtmp_url = self._search_regex( rtmp_url = self._search_regex(
r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url') r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
play_path = self._search_regex( play_path = self._search_regex(
r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
webpage, 'presenter play path') webpage, 'presenter play path')
title = clean_html(get_element_by_attribute('class', 'title', webpage)) title = clean_html(get_element_by_class('title', webpage))
video_info = { video_info = {
'id': talk_id, 'id': talk_id,
'title': title, 'title': title,