Adjusted URL handling to match all moviestorm URLs and raise an informative error when a moviestorm URL cannot be handled. This prevents youtube-dl from falling back on the generic IE for unsupported moviestorm URLs, as that will always fail.
This commit is contained in:
parent
3ca77367f9
commit
5af03d5cfe
@ -17,6 +17,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
|
||||
self.found_button = False
|
||||
self.watch_urls = []
|
||||
self.direct_url = False
|
||||
self.series_home_page = False
|
||||
compat_html_parser.HTMLParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
@ -24,7 +25,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
|
||||
if tag == 'td' and attrs['class'] == 'link_td':
|
||||
self.found_button = True
|
||||
elif tag == 'a' and self.found_button:
|
||||
# suppress ishare and other direct links, can't handle now
|
||||
# Suppress ishare and other direct links, can't handle now
|
||||
if 'moviestorm' in attrs['href']:
|
||||
self.watch_urls.append(attrs['href'].strip())
|
||||
elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link':
|
||||
@ -34,35 +35,47 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
|
||||
if tag == 'td':
|
||||
self.found_button = False
|
||||
|
||||
@classmethod
|
||||
def extract_watch_urls(cls, html):
|
||||
p = cls()
|
||||
p.feed(html)
|
||||
p.close()
|
||||
return p.watch_urls
|
||||
def handle_data(self, data):
|
||||
if data.strip() == 'SHOW EPISODES':
|
||||
self.series_home_page = True
|
||||
|
||||
@classmethod
|
||||
def extract_direct_url(cls, html):
|
||||
def custom_parse(cls, html, return_variable):
|
||||
p = cls()
|
||||
p.feed(html)
|
||||
p.close()
|
||||
return p.direct_url
|
||||
return getattr(p, return_variable)
|
||||
|
||||
class MovieStormIE(InfoExtractor):
|
||||
IE_DESC = 'Movie Storm (link farm)'
|
||||
IE_NAME = 'MovieStorm'
|
||||
_VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)'
|
||||
_LINK_FARM = True
|
||||
|
||||
# HANDLER INFO:
|
||||
# There are no tests for this IE because the links on any given moviestorm
|
||||
# page can dynamically change, and because the actual download/extraction
|
||||
# is ultimately performed by another IE. An example of an acceptable url to
|
||||
# feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1
|
||||
# is ultimately performed by another IE. Example urls to
|
||||
# feed to this IE are:
|
||||
#
|
||||
# EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1
|
||||
# MOVIE: http://moviestorm.eu/view/5269-watch-taken-3-online.html
|
||||
#
|
||||
# If the user provides a series url, like the one below, this IE should detect
|
||||
# it and raise an error:
|
||||
#
|
||||
# SERIES: http://moviestorm.eu/view/5821-watch-portlandia.html
|
||||
#
|
||||
# In other news, moviestorm's drupal db config is unstable at times, so we retry up to 5
|
||||
# times before giving up, waiting 5 seconds between each retry.
|
||||
#
|
||||
# Also, this IE will catch all links with http://moviestorm.eu urls. If it's an
|
||||
# un-handleable url, an error will be thrown informing the user of appropriate
|
||||
# urls to provide. Not using a more complex regex is meant to prevent unacceptable
|
||||
# moviestorm urls from falling back into the generic IE, as that will always fail on
|
||||
# moviestorm links.
|
||||
|
||||
IE_DESC = 'Movie Storm (link farm)'
|
||||
IE_NAME = 'MovieStorm'
|
||||
_VALID_URL = r'http://moviestorm\.eu'
|
||||
_LINK_FARM = True
|
||||
_TEST = False
|
||||
|
||||
# moviestorm's drupal db config is unstable at times
|
||||
# retry up to 5 times before giving up, 5 second delay
|
||||
# between each retry
|
||||
retry_count = 0
|
||||
max_retries = 5
|
||||
retry_wait = 5
|
||||
@ -75,7 +88,12 @@ class MovieStormIE(InfoExtractor):
|
||||
return (uri, hash, token)
|
||||
|
||||
def _real_extract(self, url):
|
||||
# retry loop to capture moviestorm page
|
||||
# Inform user to provide proper moviestorm link
|
||||
if 'watch' not in url:
|
||||
msg = ('The moviestorm handler requires either a movie page link or '
|
||||
'a series episode page link. Please try again with one of those.')
|
||||
raise ExtractorError(msg, expected=True)
|
||||
|
||||
while True:
|
||||
if self.retry_count == 0:
|
||||
note = 'Downloading link farm page'
|
||||
@ -93,8 +111,21 @@ class MovieStormIE(InfoExtractor):
|
||||
)
|
||||
|
||||
if farmpage.strip() != 'MySQL server has gone away':
|
||||
series_home_page = MovieStormHTMLParser.custom_parse(
|
||||
farmpage,
|
||||
'series_home_page'
|
||||
)
|
||||
|
||||
# Fail if provided series home page
|
||||
if series_home_page:
|
||||
msg = ('It looks like you provided an show page url. You must provide '
|
||||
'an episode page url or movie page url')
|
||||
raise ExtractorError(msg, expected=True)
|
||||
|
||||
# Success
|
||||
break
|
||||
|
||||
# Continue retrying if moviestorm database is currently unstable
|
||||
if self.retry_count < self.max_retries:
|
||||
self.retry_count += 1
|
||||
sleep(self.retry_wait)
|
||||
@ -102,25 +133,39 @@ class MovieStormIE(InfoExtractor):
|
||||
msg = 'The moviestorm database is currently unstable. Please try again later.'
|
||||
raise ExtractorError(msg, expected=True)
|
||||
|
||||
# scrape WATCH button links from moviestorm page
|
||||
# Scrape WATCH button links from moviestorm page
|
||||
self.to_screen(': Extracting watch page urls')
|
||||
watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage)
|
||||
watch_urls = MovieStormHTMLParser.custom_parse(
|
||||
farmpage,
|
||||
'watch_urls'
|
||||
)
|
||||
|
||||
# get direct urls from scraped watch pages
|
||||
# Get direct urls from scraped watch pages
|
||||
self.to_screen(': Extracting direct links from watch pages')
|
||||
for watch_url in watch_urls:
|
||||
(_, _, token) = self._parse_target(watch_url)
|
||||
watchpage = self._download_webpage(
|
||||
watch_url, token,
|
||||
note=False,
|
||||
errnote='Unable to download link farm watch page',
|
||||
fatal=False
|
||||
)
|
||||
direct_url_count = 1
|
||||
|
||||
if watchpage is not None:
|
||||
direct_url = MovieStormHTMLParser.extract_direct_url(watchpage)
|
||||
if direct_url:
|
||||
self.direct_urls.append(direct_url)
|
||||
for watch_url in watch_urls:
|
||||
# Stop after gathering 50 urls; moviestorm sends a 503 if
|
||||
# too many requests are made in rapid succession
|
||||
if direct_url_count < 50:
|
||||
(_, _, token) = self._parse_target(watch_url)
|
||||
watchpage = self._download_webpage(
|
||||
watch_url, token,
|
||||
note=False,
|
||||
errnote='Unable to download link farm watch page',
|
||||
fatal=False
|
||||
)
|
||||
|
||||
if watchpage is not None:
|
||||
direct_url = MovieStormHTMLParser.custom_parse(
|
||||
watchpage,
|
||||
'direct_url'
|
||||
)
|
||||
|
||||
if direct_url:
|
||||
self.direct_urls.append(direct_url)
|
||||
|
||||
direct_url_count += 1
|
||||
|
||||
self.to_screen(': Passing off farmed links to InfoExtractors')
|
||||
return list(set(self.direct_urls))
|
||||
|
Loading…
x
Reference in New Issue
Block a user