From 5af03d5cfecdaf3adf24f3ad4d8660a8b0743278 Mon Sep 17 00:00:00 2001
From: Philip Ardery <arderyp@gmail.com>
Date: Sat, 14 Mar 2015 18:34:27 -0400
Subject: [PATCH] Adjust url handling to match all moviestorm urls and raise
 an informative error when a moviestorm url cannot be handled. This prevents
 youtube-dl from falling back on the generic IE for bad moviestorm urls, as
 that will always fail

---
 youtube_dl/extractor/moviestorm.py | 115 ++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 35 deletions(-)

diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py
index d6ddd8e82..01ab19faf 100644
--- a/youtube_dl/extractor/moviestorm.py
+++ b/youtube_dl/extractor/moviestorm.py
@@ -17,6 +17,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
         self.found_button = False
         self.watch_urls = []
         self.direct_url = False
+        self.series_home_page = False
         compat_html_parser.HTMLParser.__init__(self)
 
     def handle_starttag(self, tag, attrs):
@@ -24,7 +25,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
         if tag == 'td' and attrs['class'] == 'link_td':
             self.found_button = True
         elif tag == 'a' and self.found_button:
-            # suppress ishare and other direct links, can't handle now
+            # Suppress ishare and other direct links, can't handle now
             if 'moviestorm' in attrs['href']:
                 self.watch_urls.append(attrs['href'].strip())
         elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link':
@@ -34,35 +35,47 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
         if tag == 'td':
             self.found_button = False
 
-    @classmethod
-    def extract_watch_urls(cls, html):
-        p = cls()
-        p.feed(html)
-        p.close()
-        return p.watch_urls
+    def handle_data(self, data):
+        if data.strip() == 'SHOW EPISODES':
+            self.series_home_page = True
 
     @classmethod
-    def extract_direct_url(cls, html):
+    def custom_parse(cls, html, return_variable):
         p = cls()
         p.feed(html)
         p.close()
-        return p.direct_url
+        return getattr(p, return_variable)
 
 class MovieStormIE(InfoExtractor):
-    IE_DESC = 'Movie Storm (link farm)'
-    IE_NAME = 'MovieStorm'
-    _VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)'
-    _LINK_FARM = True
-
+    # HANDLER INFO:
     # There are no tests for this IE because the links on any given moviestorm
     # page can dynamically change, and because the actual download/extraction
-    # is ultimately preformed by another IE. An example of an acceptable url to
-    # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1
+    # is ultimately performed by another IE. Example urls to
+    # feed to this IE are:
+    #
+    #   EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1
+    #   MOVIE:   http://moviestorm.eu/view/5269-watch-taken-3-online.html
+    #
+    # If the user provides a series url, like the one below, this IE detects it
+    # and raises an error:
+    #
+    #   SERIES:  http://moviestorm.eu/view/5821-watch-portlandia.html
+    #
+    # In other news, moviestorm's drupal db config is unstable at times, so we retry
+    # up to 5 times before giving up, with a 5 second delay between each retry.
+    #
+    # Also, this IE matches all http://moviestorm.eu urls. If a url cannot be
+    # handled, an error is raised telling the user which kinds of urls to provide.
+    # The regex is deliberately kept simple so that unacceptable moviestorm urls
+    # do not fall back to the generic IE, as that will always fail on
+    # moviestorm links.
+
+    IE_DESC = 'Movie Storm (link farm)'
+    IE_NAME = 'MovieStorm'
+    _VALID_URL = r'http://moviestorm\.eu'
+    _LINK_FARM = True
     _TEST = False
 
-    # moviestorm's drupal db config is unstable at times
-    # retry up to 5 times before giving up, 5 second delay
-    # between each retry
     retry_count = 0
     max_retries = 5
     retry_wait = 5
@@ -75,7 +88,12 @@ class MovieStormIE(InfoExtractor):
         return (uri, hash, token)
 
     def _real_extract(self, url):
-        # retry loop to capture moviestorm page
+        # Inform user to provide proper moviestorm link
+        if 'watch' not in url:
+            msg = ('The moviestorm handler requires either a movie page link or '
+                'a series episode page link.  Please try again with one of those.')
+            raise ExtractorError(msg, expected=True)
+
         while True:
             if self.retry_count == 0:
                 note = 'Downloading link farm page'
@@ -93,8 +111,21 @@ class MovieStormIE(InfoExtractor):
             )
 
             if farmpage.strip() != 'MySQL server has gone away':
+                series_home_page = MovieStormHTMLParser.custom_parse(
+                    farmpage,
+                    'series_home_page'
+                )
+
+                # Fail if provided series home page
+                if series_home_page:
+                    msg = ('It looks like you provided an show page url.  You must provide '
+                        'an episode page url or movie page url')
+                    raise ExtractorError(msg, expected=True)
+
+                # Success
                 break
 
+            # Continue retrying if moviestorm database is currently unstable
             if self.retry_count < self.max_retries:
                 self.retry_count += 1
                 sleep(self.retry_wait)
@@ -102,25 +133,39 @@ class MovieStormIE(InfoExtractor):
                 msg = 'The moviestorm database is currently unstable.  Please try again later.'
                 raise ExtractorError(msg, expected=True)
 
-        # scrape WATCH button links from moviestorm page
+        # Scrape WATCH button links from moviestorm page
         self.to_screen(': Extracting watch page urls')
-        watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage)
+        watch_urls = MovieStormHTMLParser.custom_parse(
+            farmpage,
+            'watch_urls'
+        )
 
-        # get direct urls from scraped watch pages
+        # Get direct urls from scraped watch pages
         self.to_screen(': Extracting direct links from watch pages')
-        for watch_url in watch_urls:
-            (_, _, token) = self._parse_target(watch_url)
-            watchpage = self._download_webpage(
-                watch_url, token,
-                note=False,
-                errnote='Unable to download link farm watch page',
-                fatal=False
-            )
+        direct_url_count = 1
 
-            if watchpage is not None:
-                direct_url = MovieStormHTMLParser.extract_direct_url(watchpage)
-                if direct_url:
-                    self.direct_urls.append(direct_url)
+        for watch_url in watch_urls:
+            # Cap the number of watch pages requested (the counter starts at 1, so
+            # at most 49); moviestorm sends 503 if too many are requested in rapid succession
+            if direct_url_count < 50:
+                (_, _, token) = self._parse_target(watch_url)
+                watchpage = self._download_webpage(
+                    watch_url, token,
+                    note=False,
+                    errnote='Unable to download link farm watch page',
+                    fatal=False
+                )
+
+                if watchpage is not None:
+                    direct_url = MovieStormHTMLParser.custom_parse(
+                        watchpage,
+                        'direct_url'
+                    )
+
+                    if direct_url:
+                        self.direct_urls.append(direct_url)
+
+            direct_url_count += 1
 
         self.to_screen(': Passing off farmed links to InfoExtractors')
         return list(set(self.direct_urls))