[ellentube] Improve extraction of playlists.

Instead of searching the webpage for video IDs and making an API
request for each video ID, we now make a single API request to
get all the information needed to extract a playlist.
This commit is contained in:
Alex Seiler 2017-10-30 01:47:49 +01:00
parent 061b47ce71
commit 5d81166f78
2 changed files with 52 additions and 39 deletions

View File

@ -1,10 +1,9 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
clean_html,
int_or_none, int_or_none,
urljoin, urljoin,
) )
@ -13,24 +12,21 @@ from ..utils import (
class EllenTubeBaseIE(InfoExtractor): class EllenTubeBaseIE(InfoExtractor):
API_URL = 'https://api-prod.ellentube.com/' API_URL = 'https://api-prod.ellentube.com/'
def _extract_from_video_id(self, video_id, display_id=None): def _extract_video_from_json(self, data, video_id, display_id=None):
video_data = self._download_json( title = data['title']
urljoin(self.API_URL, 'ellenapi/api/item/%s' % video_id), video_id) description = data.get('description')
title = video_data['title'] publish_time = int_or_none(data.get('publishTime'))
description = video_data.get('description') thumbnail = data.get('thumbnail')
publish_time = int_or_none(video_data.get('publishTime'))
thumbnail = video_data.get('thumbnail')
formats = [] formats = []
duration = None duration = None
for entry in video_data.get('media'): for entry in data.get('media'):
if entry.get('id') == 'm3u8': if entry.get('id') == 'm3u8':
formats = self._extract_m3u8_formats(entry.get( formats = self._extract_m3u8_formats(
'url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') entry.get('url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
duration = int_or_none(entry.get('duration')) duration = int_or_none(entry.get('duration'))
break break
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
@ -42,10 +38,31 @@ class EllenTubeBaseIE(InfoExtractor):
'formats': formats, 'formats': formats,
} }
def _extract_video_ids_from_api_search(self, api_search, display_id): def _extract_playlist_entries_from_json(self, data, display_id):
feed_data = self._download_json( return [self._extract_video_from_json(elem, elem['id'])
for elem in data if elem.get('type') == 'VIDEO']
def _extract_from_video_id(self, video_id, display_id=None):
api_data = self._download_json(
urljoin(self.API_URL, 'ellenapi/api/item/%s' % video_id), video_id)
return self._extract_video_from_json(api_data, video_id, display_id)
def _extract_playlist(self, url, display_id, extract_description=True):
webpage = self._download_webpage(url, display_id)
playlist_data = self._html_search_regex(
r'<div\s+data-component\s*=\s*"Details"(.+)</div>', webpage, 'playlist data')
playlist_title = self._search_regex(
r'"title"\s*:\s*"(.+?)"', playlist_data, 'playlist title')
playlist_description = clean_html(self._search_regex(
r'"description"\s*:\s*"(.+?)"', playlist_data, 'playlist description',
fatal=False)) if extract_description else None
api_search = self._search_regex(
r'"filter"\s*:\s*"(.+?)"', playlist_data, 'playlist api request')
api_data = self._download_json(
urljoin(self.API_URL, 'ellenapi/api/feed/?%s' % api_search), display_id) urljoin(self.API_URL, 'ellenapi/api/feed/?%s' % api_search), display_id)
return [entry.get('id') for entry in feed_data if entry.get('type') == 'VIDEO'] return self.playlist_result(
self._extract_playlist_entries_from_json(api_data, display_id),
display_id, playlist_title, playlist_description)
class EllenTubeVideoIE(EllenTubeBaseIE): class EllenTubeVideoIE(EllenTubeBaseIE):
@ -74,41 +91,36 @@ class EllenTubeVideoIE(EllenTubeBaseIE):
return self._extract_from_video_id(video_id, display_id) return self._extract_from_video_id(video_id, display_id)
class EllenTubePlaylistIE(EllenTubeBaseIE): class EllenTubeEpisodeIE(EllenTubeBaseIE):
_VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+)\.html' _VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P<id>.+)\.html'
_TESTS = [{ _TEST = {
'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',
'info_dict': { 'info_dict': {
'id': 'dax-shepard-jordan-fisher-haim', 'id': 'dax-shepard-jordan-fisher-haim',
'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM', 'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM',
'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d'
}, },
'playlist_count': 6, 'playlist_count': 6,
}, { }
def _real_extract(self, url):
display_id = self._match_id(url)
return self._extract_playlist(url, display_id)
class EllenTubeStudioIE(EllenTubeBaseIE):
_VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P<id>.+)\.html'
_TEST = {
'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html',
'info_dict': { 'info_dict': {
'id': 'macey-goes-rving0', 'id': 'macey-goes-rving0',
'title': 'Macey Goes RVing', 'title': 'Macey Goes RVing',
}, },
'playlist_mincount': 3, 'playlist_mincount': 3,
}] }
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) return self._extract_playlist(url, display_id, False)
playlist_data = self._html_search_regex(
r'<div\s+data-component\s*=\s*"Details"(.+)</div>', webpage, 'episode data')
playlist_title = self._search_regex(
r'title"\s*:\s*"(.+?)"', playlist_data, 'playlist title')
entries = [self._extract_from_video_id(m.group('vid')) for m in re.finditer(
r'pid=(?P<vid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', playlist_data)]
if not entries:
api_search = self._search_regex(
r'filter"\s*:\s*"(.+?)"', playlist_data, 'api search')
video_ids = self._extract_video_ids_from_api_search(
api_search, display_id)
entries = [self._extract_from_video_id(
vid, display_id) for vid in video_ids]
return self.playlist_result(entries, display_id, playlist_title)

View File

@ -309,7 +309,8 @@ from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE from .einthusan import EinthusanIE
from .eitb import EitbIE from .eitb import EitbIE
from .ellentube import ( from .ellentube import (
EllenTubePlaylistIE, EllenTubeEpisodeIE,
EllenTubeStudioIE,
EllenTubeVideoIE, EllenTubeVideoIE,
) )
from .elpais import ElPaisIE from .elpais import ElPaisIE