[WatchTvSeries] Add new extractor for watch-tv-series.to
This commit is contained in:
parent
d26b1317ed
commit
9d334aedc8
@ -504,6 +504,10 @@ from .vulture import VultureIE
|
|||||||
from .walla import WallaIE
|
from .walla import WallaIE
|
||||||
from .washingtonpost import WashingtonPostIE
|
from .washingtonpost import WashingtonPostIE
|
||||||
from .wat import WatIE
|
from .wat import WatIE
|
||||||
|
from .watch_tv_series import (
|
||||||
|
WatchTvSeriesSeasonIE,
|
||||||
|
WatchTvSeriesEpisodeIE,
|
||||||
|
)
|
||||||
from .wayofthemaster import WayOfTheMasterIE
|
from .wayofthemaster import WayOfTheMasterIE
|
||||||
from .wdr import (
|
from .wdr import (
|
||||||
WDRIE,
|
WDRIE,
|
||||||
|
108
youtube_dl/extractor/watch_tv_series.py
Normal file
108
youtube_dl/extractor/watch_tv_series.py
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..utils import clean_html, ExtractorError
|
||||||
|
|
||||||
|
try:
|
||||||
|
import microdata
|
||||||
|
except ImportError:
|
||||||
|
microdata = None
|
||||||
|
|
||||||
|
|
||||||
|
class WatchTvSeriesSeasonIE(InfoExtractor):
|
||||||
|
domain_base = 'http://watch-tv-series.to/'
|
||||||
|
_VALID_URL = domain_base + r'(?P<path>season-(?P<season>\d+)/[a-z]+)'
|
||||||
|
|
||||||
|
def extract_microdata(self):
|
||||||
|
item, = microdata.get_items(self.content)
|
||||||
|
name = item.name
|
||||||
|
playlist_title = '%s - Season %s' % (name, self.season)
|
||||||
|
playlist_description = item.description
|
||||||
|
|
||||||
|
season = item.season
|
||||||
|
episodes = sorted(
|
||||||
|
season.get_all('episode'), key=lambda ep: ep.datepublished)
|
||||||
|
|
||||||
|
entries = [
|
||||||
|
self.url_result(self.domain_base + ep.url.lstrip('/'),
|
||||||
|
'WatchTvSeriesEpisode')
|
||||||
|
for ep in episodes
|
||||||
|
]
|
||||||
|
|
||||||
|
return self.playlist_result(
|
||||||
|
entries, playlist_id=item.url, playlist_title=playlist_title,
|
||||||
|
playlist_description=playlist_description)
|
||||||
|
|
||||||
|
def extract_strings(self):
|
||||||
|
name = self._search_regex(
|
||||||
|
r'<span itemprop="name">([^<]+)</span>',
|
||||||
|
self.content, 'show name')
|
||||||
|
|
||||||
|
playlist_title = '%s - Season %s' % (name, self.season)
|
||||||
|
|
||||||
|
data = {}
|
||||||
|
entries = []
|
||||||
|
|
||||||
|
# This parsing is sensitive to the order of HTML attributes in the
|
||||||
|
# content of the page, but it works for now.
|
||||||
|
itemprop_regex = r'itemprop="([a-z]+)"(?: content="([^"]+)"|>([^<]+))'
|
||||||
|
|
||||||
|
for mobj in re.finditer(itemprop_regex, self.content):
|
||||||
|
key = mobj.group(1)
|
||||||
|
value = mobj.group(2) or mobj.group(3)
|
||||||
|
value = clean_html(value).strip()
|
||||||
|
data[key] = value
|
||||||
|
if key == 'name' and data.get('url', '').startswith('/episode'):
|
||||||
|
url = self.domain_base + data['url'].lstrip('/')
|
||||||
|
entries.append(self.url_result(url, 'WatchTvSeriesEpisode'))
|
||||||
|
|
||||||
|
# Episodes are displayed with the latest first,
|
||||||
|
# but we want to retrieve the earliest first.
|
||||||
|
entries.reverse()
|
||||||
|
|
||||||
|
return self.playlist_result(
|
||||||
|
entries, playlist_id=self.path, playlist_title=playlist_title)
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
self.path = mobj.group('path')
|
||||||
|
self.season = mobj.group('season')
|
||||||
|
self.content = self._download_webpage(url, self.path)
|
||||||
|
if microdata:
|
||||||
|
return self.extract_microdata()
|
||||||
|
else:
|
||||||
|
return self.extract_strings()
|
||||||
|
|
||||||
|
|
||||||
|
class WatchTvSeriesEpisodeIE(InfoExtractor):
|
||||||
|
domain_base = 'http://watch-tv-series.to/'
|
||||||
|
_VALID_URL = 'http://watch-tv-series.to/(?P<path>episode/[a-z_0-9]+.html)'
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
# There is microdata on this page, but we don't need it,
|
||||||
|
# since we will return a url_result anyway.
|
||||||
|
|
||||||
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
path = mobj.group('path')
|
||||||
|
|
||||||
|
content = self._download_webpage(url, path)
|
||||||
|
|
||||||
|
movshare_regex = (
|
||||||
|
r'href="/([^"]+)" class="buttonlink" title="movshare.net"')
|
||||||
|
|
||||||
|
mobj = re.search(movshare_regex, content)
|
||||||
|
if mobj is None:
|
||||||
|
raise ExtractorError(
|
||||||
|
'Unable to extract movshare.net link. ' +
|
||||||
|
'No other video sites are supported.',
|
||||||
|
expected=True)
|
||||||
|
|
||||||
|
external_link = self._download_webpage(
|
||||||
|
self.domain_base + mobj.group(1), path + '[external]')
|
||||||
|
|
||||||
|
mobj = re.search(
|
||||||
|
r'http://www.movshare.net/video/[a-z0-9]+', external_link)
|
||||||
|
|
||||||
|
return self.url_result(mobj.group(0))
|
Loading…
x
Reference in New Issue
Block a user