adjusted url handling to pull in all moviestorm urls and provide informative error if not a handleable moviestorm url. This is to prevent youtube-dl from falling back on the generic IE for bad moviestorm urls, as that will always fail

2015-03-14 18:34:27 -04:00 · 2015-03-14 18:34:27 -04:00 · 5af03d5cfe
commit 5af03d5cfe
parent 3ca77367f9
1 changed files with 80 additions and 35 deletions
--- a/youtube_dl/extractor/moviestorm.py
+++ b/youtube_dl/extractor/moviestorm.py
@ -17,6 +17,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
        self.found_button = False
        self.watch_urls = []
        self.direct_url = False
+        self.series_home_page = False
        compat_html_parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
@ -24,7 +25,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
        if tag == 'td' and attrs['class'] == 'link_td':
            self.found_button = True
        elif tag == 'a' and self.found_button:
-            # suppress ishare and other direct links, can't handle now
+            # Suppress ishare and other direct links, can't handle now
            if 'moviestorm' in attrs['href']:
                self.watch_urls.append(attrs['href'].strip())
        elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link':
@ -34,35 +35,47 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser):
        if tag == 'td':
            self.found_button = False

-    @classmethod
-    def extract_watch_urls(cls, html):
-        p = cls()
-        p.feed(html)
-        p.close()
-        return p.watch_urls
+    def handle_data(self, data):
+        if data.strip() == 'SHOW EPISODES':
+            self.series_home_page = True

    @classmethod
-    def extract_direct_url(cls, html):
+    def custom_parse(cls, html, return_variable):
        p = cls()
        p.feed(html)
        p.close()
-        return p.direct_url
+        return getattr(p, return_variable)

 class MovieStormIE(InfoExtractor):
-    IE_DESC = 'Movie Storm (link farm)'
-    IE_NAME = 'MovieStorm'
-    _VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)'
-    _LINK_FARM = True
-
+    # HANDLER INFO:
    # There are no tests for this IE because the links on any given moviestorm
    # page can dynamically change, and because the actual download/extraction
-    # is ultimately preformed by another IE. An example of an acceptable url to
-    # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1
+    # is ultimately preformed by another IE. Example urls to
+    # feed to this IE are:
+    #
+    #   EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1
+    #   MOVIE:   http://moviestorm.eu/view/5269-watch-taken-3-online.html
+    #
+    # If the user provides a series url, like the one below, this IE should detect
+    # and raise an error:
+    #
+    #   SERIES:  http://moviestorm.eu/view/5821-watch-portlandia.html
+    #
+    # In other news, moviestorm's drupal db config is unstable at times retry up to 5
+    # times before giving up, waiting 5 second delay between each retry.
+    #
+    # Also, this IE will catch all links with http://moviestorm.eu urls. If it's an
+    # un-handleable url, an error will be thrown informing the user of appropriate
+    # urls to provide. Not using a more complex regex is meant to prevent unacceptable
+    # moviestorm urls from falling back into the generic IE, as that will always fail on
+    # moviestorm links.
+
+    IE_DESC = 'Movie Storm (link farm)'
+    IE_NAME = 'MovieStorm'
+    _VALID_URL = r'http://moviestorm\.eu'
+    _LINK_FARM = True
    _TEST = False

-    # moviestorm's drupal db config is unstable at times
-    # retry up to 5 times before giving up, 5 second delay
-    # between each retry
    retry_count = 0
    max_retries = 5
    retry_wait = 5
@ -75,7 +88,12 @@ class MovieStormIE(InfoExtractor):
        return (uri, hash, token)

    def _real_extract(self, url):
-        # retry loop to capture moviestorm page
+        # Inform user to provide proper moviestorm link
+        if 'watch' not in url:
+            msg = ('The moviestorm handler requires either a movie page link or '
+                'a series episode page link.  Please try again with one of those.')
+            raise ExtractorError(msg, expected=True)
+
        while True:
            if self.retry_count == 0:
                note = 'Downloading link farm page'
@ -93,8 +111,21 @@ class MovieStormIE(InfoExtractor):
            )

            if farmpage.strip() != 'MySQL server has gone away':
+                series_home_page = MovieStormHTMLParser.custom_parse(
+                    farmpage,
+                    'series_home_page'
+                )
+
+                # Fail if provided series home page
+                if series_home_page:
+                    msg = ('It looks like you provided an show page url.  You must provide '
+                        'an episode page url or movie page url')
+                    raise ExtractorError(msg, expected=True)
+
+                # Success
                break

+            # Continue retrying if moviestorm database is currently unstable
            if self.retry_count < self.max_retries:
                self.retry_count += 1
                sleep(self.retry_wait)
@ -102,25 +133,39 @@ class MovieStormIE(InfoExtractor):
                msg = 'The moviestorm database is currently unstable.  Please try again later.'
                raise ExtractorError(msg, expected=True)

-        # scrape WATCH button links from moviestorm page
+        # Scrape WATCH button links from moviestorm page
        self.to_screen(': Extracting watch page urls')
-        watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage)
+        watch_urls = MovieStormHTMLParser.custom_parse(
+            farmpage,
+            'watch_urls'
+        )

-        # get direct urls from scraped watch pages
+        # Get direct urls from scraped watch pages
        self.to_screen(': Extracting direct links from watch pages')
-        for watch_url in watch_urls:
-            (_, _, token) = self._parse_target(watch_url)
-            watchpage = self._download_webpage(
-                watch_url, token,
-                note=False,
-                errnote='Unable to download link farm watch page',
-                fatal=False
-            )
+        direct_url_count = 1

-            if watchpage is not None:
-                direct_url = MovieStormHTMLParser.extract_direct_url(watchpage)
-                if direct_url:
-                    self.direct_urls.append(direct_url)
+        for watch_url in watch_urls:
+            # Stop after gathering 50 urls, moviestorm sends 503 if
+            # request too many in rapid succession
+            if direct_url_count < 50:
+                (_, _, token) = self._parse_target(watch_url)
+                watchpage = self._download_webpage(
+                    watch_url, token,
+                    note=False,
+                    errnote='Unable to download link farm watch page',
+                    fatal=False
+                )
+
+                if watchpage is not None:
+                    direct_url = MovieStormHTMLParser.custom_parse(
+                        watchpage,
+                        'direct_url'
+                    )
+
+                    if direct_url:
+                        self.direct_urls.append(direct_url)
+
+            direct_url_count += 1

        self.to_screen(': Passing off farmed links to InfoExtractors')
        return list(set(self.direct_urls))