[ellentube] Fix broken extractor (closes #14407)

This commit is contained in:
Alex Seiler 2017-10-23 21:15:48 +02:00
parent 55c727a547
commit 40d43e940c
3 changed files with 117 additions and 104 deletions

View File

@ -0,0 +1,114 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
urljoin,
)
class EllenTubeBaseIE(InfoExtractor):
    """Shared helpers for the ellentube.com extractors.

    Resolves video ids into info dicts and lists the video ids of a feed
    search through the JSON API rooted at ``API_URL``.
    """

    API_URL = 'https://api-prod.ellentube.com/'

    def _extract_from_video_id(self, video_id, display_id=None):
        """Return the info dict for a single video.

        Downloads ``ellenapi/api/item/<video_id>`` and reads title,
        description, publish time and thumbnail from the response. Formats
        and duration are taken from the first ``media`` entry whose ``id``
        is ``'m3u8'``.

        :param video_id: item id passed to the API.
        :param display_id: optional value copied into the result.
        :return: dict with ``id``, ``title``, ``description``,
            ``display_id``, ``duration``, ``thumbnail``, ``timestamp`` and
            ``formats``.
        :raises KeyError: when the response carries no ``title``.
        """
        video_data = self._download_json(
            urljoin(self.API_URL, 'ellenapi/api/item/%s' % video_id), video_id)
        title = video_data['title']
        description = video_data.get('description')
        # NOTE(review): the EllenTubeVideoIE test expects a timestamp of
        # 1508505120000, which looks like epoch milliseconds rather than
        # seconds -- confirm whether this value should be scaled by 1000.
        publish_time = int_or_none(video_data.get('publishTime'))
        thumbnail = video_data.get('thumbnail')

        formats = []
        duration = None
        # 'media' may be missing or null in the response; treat that as an
        # empty list instead of failing with a TypeError while iterating.
        for entry in video_data.get('media') or []:
            if entry.get('id') != 'm3u8':
                continue
            m3u8_url = entry.get('url')
            # Only request the manifest when the entry actually carries a
            # URL; otherwise formats stays empty for _sort_formats to handle.
            if m3u8_url:
                formats = self._extract_m3u8_formats(
                    m3u8_url, video_id, 'mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls')
            duration = int_or_none(entry.get('duration'))
            break
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'display_id': display_id,
            'duration': duration,
            'thumbnail': thumbnail,
            'timestamp': publish_time,
            'formats': formats,
        }

    def _extract_video_ids_from_api_search(self, api_search, display_id):
        """Return the ids of all ``VIDEO`` entries of a feed search.

        :param api_search: query string appended to ``ellenapi/api/feed/?``.
        :param display_id: id passed to ``_download_json`` for this request.
        :return: list with the ``id`` value of every feed entry whose
            ``type`` is ``'VIDEO'``.
        """
        feed_data = self._download_json(
            urljoin(self.API_URL, 'ellenapi/api/feed/?%s' % api_search), display_id)
        return [entry.get('id') for entry in feed_data if entry.get('type') == 'VIDEO']
class EllenTubeVideoIE(EllenTubeBaseIE):
    """Extractor for single ellentube.com ``/video/<slug>.html`` pages."""

    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+)\.html'
    _TEST = {
        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html',
        'md5': '2fabc277131bddafdd120e0fc0f974c9',
        'info_dict': {
            'id': '0822171c-3829-43bf-b99f-d77358ae75e3',
            'ext': 'mp4',
            'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck',
            'description': 'md5:76e3355e2242a78ad9e3858e5616923f',
            'display_id': 'ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck',
            'duration': 514,
            'timestamp': 1508505120000,
            'thumbnail': 'https://warnerbros-h.assetsadobe.com/is/image/content/dam/ellen/videos/episodes/season15/32/video--2728751654987218111',
        },
    }

    def _real_extract(self, url):
        """Find the API video id in the page markup and delegate extraction.

        The URL slug is used as the display id; the id handed to the API is
        the UUID-shaped token captured after ``data-config`` in the page.
        """
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)
        api_video_id = self._search_regex(
            r'data-config.+([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            page, 'video id')
        return self._extract_from_video_id(api_video_id, slug)
class EllenTubePlaylistIE(EllenTubeBaseIE):
    """Extractor for ellentube.com ``/episode/`` and ``/studios/`` pages."""

    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+)\.html'
    _TESTS = [{
        'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',
        'info_dict': {
            'id': 'dax-shepard-jordan-fisher-haim',
            'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM',
        },
        'playlist_count': 6,
    }, {
        'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html',
        'info_dict': {
            'id': 'macey-goes-rving0',
            'title': 'Macey Goes RVing',
        },
        'playlist_mincount': 3,
    }]

    def _real_extract(self, url):
        """Build a playlist from the page's ``Details`` component.

        Video ids are first collected from ``pid=<uuid>`` references inside
        the component data.  When none are present, the component's
        ``filter`` value is used as a feed search to obtain the ids instead.
        """
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        details = self._html_search_regex(
            r'<div\s+data-component\s*=\s*"Details"(.+)</div>', page, 'episode data')
        title = self._search_regex(
            r'title"\s*:\s*"(.+?)"', details, 'playlist title')

        # Preferred path: ids embedded directly in the component data.
        entries = []
        for match in re.finditer(
                r'pid=(?P<vid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', details):
            entries.append(self._extract_from_video_id(match.group('vid')))
        if entries:
            return self.playlist_result(entries, display_id, title)

        # Fallback path: resolve the ids through the feed search API.
        feed_query = self._search_regex(
            r'filter"\s*:\s*"(.+?)"', details, 'api search')
        found_ids = self._extract_video_ids_from_api_search(feed_query, display_id)
        entries = []
        for found_id in found_ids:
            entries.append(self._extract_from_video_id(found_id, display_id))
        return self.playlist_result(entries, display_id, title)

View File

@ -1,101 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .kaltura import KalturaIE
from ..utils import NO_DEFAULT
class EllenTVIE(InfoExtractor):
    """Extractor for ellentv.com / ellentube.com ``/videos/<id>`` pages.

    Locates the Kaltura partner and entry ids in the page and hands the
    actual extraction over to the Kaltura extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
    _TESTS = [{
        'url': 'http://www.ellentv.com/videos/0-ipq1gsai/',
        'md5': '4294cf98bc165f218aaa0b89e0fd8042',
        'info_dict': {
            'id': '0_ipq1gsai',
            'ext': 'mov',
            'title': 'Fast Fingers of Fate',
            'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a',
            'timestamp': 1428035648,
            'upload_date': '20150403',
            'uploader_id': 'batchUser',
        },
    }, {
        # not available via http://widgets.ellentube.com/
        'url': 'http://www.ellentv.com/videos/1-szkgu2m2/',
        'info_dict': {
            'id': '1_szkgu2m2',
            'ext': 'flv',
            'title': "Ellen's Amazingly Talented Audience",
            'description': 'md5:86ff1e376ff0d717d7171590e273f0a5',
            'timestamp': 1255140900,
            'upload_date': '20091010',
            'uploader_id': 'ellenkaltura@gmail.com',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return a ``kaltura:<partner>:<entry>`` URL result for the video.

        The widgets.ellentube.com page is tried first and the original URL
        second.  Only the last attempt is fatal: earlier attempts may fail
        to download or to match without aborting the extraction.
        """
        video_id = self._match_id(url)
        URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url)
        for num, url_ in enumerate(URLS, 1):
            is_last = num == len(URLS)
            webpage = self._download_webpage(url_, video_id, fatal=is_last)
            # A non-fatal download reports failure by returning False; skip
            # to the next candidate instead of running regexes on a non-string.
            if webpage is False:
                continue
            default = NO_DEFAULT if is_last else None
            partner_id = self._search_regex(
                r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id',
                default=default)
            kaltura_id = self._search_regex(
                [r'id="kaltura_player_([^"]+)"',
                 r"_wb_entry_id\s*:\s*'([^']+)",
                 r'data-kaltura-entry-id="([^"]+)'],
                webpage, 'kaltura id', default=default)
            if partner_id and kaltura_id:
                break
        return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key())
class EllenTVClipsIE(InfoExtractor):
    """Extractor for ellentv.com ``/episodes/<id>`` clip playlists."""

    IE_NAME = 'EllenTV:clips'
    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)'
    _TEST = {
        'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/',
        'info_dict': {
            'id': 'meryl-streep-vanessa-hudgens',
            'title': 'Meryl Streep, Vanessa Hudgens',
        },
        'playlist_mincount': 5,
    }

    def _real_extract(self, url):
        """Return a playlist dict built from the clips embedded in the page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        clips = self._extract_playlist(webpage, playlist_id)
        title = self._og_search_title(webpage)
        entries = self._extract_entries(clips)
        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }

    def _extract_playlist(self, webpage, playlist_id):
        """Parse the clip list passed to ``playerView.addClips`` in the page.

        The regex captures only the inside of the ``[{ ... }]`` argument, so
        the brackets are put back before the text is parsed as JSON.
        """
        clips_inner = self._search_regex(
            r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json')
        return self._parse_json('[{%s}]' % clips_inner, playlist_id)

    def _extract_entries(self, playlist):
        """Turn each clip item into a Kaltura URL result."""
        results = []
        for clip in playlist:
            partner_id = clip['kaltura_partner_id']
            entry_id = clip['kaltura_entry_id']
            results.append(self.url_result(
                'kaltura:%s:%s' % (partner_id, entry_id),
                KalturaIE.ie_key(), video_id=entry_id))
        return results

View File

@ -308,9 +308,9 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE from .einthusan import EinthusanIE
from .eitb import EitbIE from .eitb import EitbIE
from .ellentv import ( from .ellentube import (
EllenTVIE, EllenTubePlaylistIE,
EllenTVClipsIE, EllenTubeVideoIE,
) )
from .elpais import ElPaisIE from .elpais import ElPaisIE
from .embedly import EmbedlyIE from .embedly import EmbedlyIE