Adjusted url handling to pull in all moviestorm urls and provide an informative error if the url is not a handleable moviestorm url. This is to prevent youtube-dl from falling back on the generic IE for bad moviestorm urls, as that will always fail.

This commit is contained in:
Philip Ardery 2015-03-14 18:34:27 -04:00
parent 3ca77367f9
commit 5af03d5cfe

View File

@ -17,6 +17,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
self.found_button = False
self.watch_urls = []
self.direct_url = False
self.series_home_page = False
compat_html_parser.HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
@ -24,7 +25,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
if tag == 'td' and attrs['class'] == 'link_td':
self.found_button = True
elif tag == 'a' and self.found_button:
# suppress ishare and other direct links, can't handle now
# Suppress ishare and other direct links, can't handle now
if 'moviestorm' in attrs['href']:
self.watch_urls.append(attrs['href'].strip())
elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link':
@ -34,35 +35,47 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
if tag == 'td':
self.found_button = False
@classmethod
def extract_watch_urls(cls, html):
p = cls()
p.feed(html)
p.close()
return p.watch_urls
def handle_data(self, data):
if data.strip() == 'SHOW EPISODES':
self.series_home_page = True
@classmethod
def extract_direct_url(cls, html):
def custom_parse(cls, html, return_variable):
p = cls()
p.feed(html)
p.close()
return p.direct_url
return getattr(p, return_variable)
class MovieStormIE(InfoExtractor):
IE_DESC = 'Movie Storm (link farm)'
IE_NAME = 'MovieStorm'
_VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)'
_LINK_FARM = True
# HANDLER INFO:
# There are no tests for this IE because the links on any given moviestorm
# page can dynamically change, and because the actual download/extraction
# is ultimately performed by another IE. An example of an acceptable url to
# feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1
# is ultimately performed by another IE. Example urls to
# feed to this IE are:
#
# EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1
# MOVIE: http://moviestorm.eu/view/5269-watch-taken-3-online.html
#
# If the user provides a series url, like the one below, this IE should detect
# and raise an error:
#
# SERIES: http://moviestorm.eu/view/5821-watch-portlandia.html
#
# In other news, moviestorm's drupal db config is unstable at times, so retry up to 5
# times before giving up, waiting 5 seconds between each retry.
#
# Also, this IE will catch all links with http://moviestorm.eu urls. If it's an
# un-handleable url, an error will be thrown informing the user of appropriate
# urls to provide. Not using a more complex regex is meant to prevent unacceptable
# moviestorm urls from falling back into the generic IE, as that will always fail on
# moviestorm links.
IE_DESC = 'Movie Storm (link farm)'
IE_NAME = 'MovieStorm'
_VALID_URL = r'http://moviestorm\.eu'
_LINK_FARM = True
_TEST = False
# moviestorm's drupal db config is unstable at times
# retry up to 5 times before giving up, 5 second delay
# between each retry
retry_count = 0
max_retries = 5
retry_wait = 5
@ -75,7 +88,12 @@ class MovieStormIE(InfoExtractor):
return (uri, hash, token)
def _real_extract(self, url):
# retry loop to capture moviestorm page
# Inform user to provide proper moviestorm link
if 'watch' not in url:
msg = ('The moviestorm handler requires either a movie page link or '
'a series episode page link. Please try again with one of those.')
raise ExtractorError(msg, expected=True)
while True:
if self.retry_count == 0:
note = 'Downloading link farm page'
@ -93,8 +111,21 @@ class MovieStormIE(InfoExtractor):
)
if farmpage.strip() != 'MySQL server has gone away':
series_home_page = MovieStormHTMLParser.custom_parse(
farmpage,
'series_home_page'
)
# Fail if provided series home page
if series_home_page:
msg = ('It looks like you provided an show page url. You must provide '
'an episode page url or movie page url')
raise ExtractorError(msg, expected=True)
# Success
break
# Continue retrying if moviestorm database is currently unstable
if self.retry_count < self.max_retries:
self.retry_count += 1
sleep(self.retry_wait)
@ -102,25 +133,39 @@ class MovieStormIE(InfoExtractor):
msg = 'The moviestorm database is currently unstable. Please try again later.'
raise ExtractorError(msg, expected=True)
# scrape WATCH button links from moviestorm page
# Scrape WATCH button links from moviestorm page
self.to_screen(': Extracting watch page urls')
watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage)
watch_urls = MovieStormHTMLParser.custom_parse(
farmpage,
'watch_urls'
)
# get direct urls from scraped watch pages
# Get direct urls from scraped watch pages
self.to_screen(': Extracting direct links from watch pages')
for watch_url in watch_urls:
(_, _, token) = self._parse_target(watch_url)
watchpage = self._download_webpage(
watch_url, token,
note=False,
errnote='Unable to download link farm watch page',
fatal=False
)
direct_url_count = 1
if watchpage is not None:
direct_url = MovieStormHTMLParser.extract_direct_url(watchpage)
if direct_url:
self.direct_urls.append(direct_url)
for watch_url in watch_urls:
# Stop after gathering 50 urls; moviestorm sends a 503 if
# too many requests are made in rapid succession
if direct_url_count < 50:
(_, _, token) = self._parse_target(watch_url)
watchpage = self._download_webpage(
watch_url, token,
note=False,
errnote='Unable to download link farm watch page',
fatal=False
)
if watchpage is not None:
direct_url = MovieStormHTMLParser.custom_parse(
watchpage,
'direct_url'
)
if direct_url:
self.direct_urls.append(direct_url)
direct_url_count += 1
self.to_screen(': Passing off farmed links to InfoExtractors')
return list(set(self.direct_urls))