added moviestorm InfoExtractor. This is a link farm handler that scrapes urls from a moviestorm page, which are then handed off to one of youtube-dl's other handlers for download/extraction
This commit is contained in:
parent
082b1155a3
commit
9dc8c6eb23
128
youtube_dl/extractor/moviestorm.py
Normal file
128
youtube_dl/extractor/moviestorm.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import os.path
|
||||||
|
import re
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..utils import ExtractorError
|
||||||
|
from ..compat import (
|
||||||
|
compat_html_parser,
|
||||||
|
compat_urllib_parse,
|
||||||
|
compat_urllib_request,
|
||||||
|
compat_urlparse,
|
||||||
|
)
|
||||||
|
|
||||||
|
class MovieStormHTMLParser(compat_html_parser.HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
self.found_button = False
|
||||||
|
self.watch_urls = []
|
||||||
|
self.direct_url = False
|
||||||
|
compat_html_parser.HTMLParser.__init__(self)
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
attrs = dict((k, v) for k, v in attrs)
|
||||||
|
if tag == 'td' and attrs['class'] == 'link_td':
|
||||||
|
self.found_button = True
|
||||||
|
elif tag == 'a' and self.found_button:
|
||||||
|
# suppress ishare and other direct links, can't handle now
|
||||||
|
if 'moviestorm' in attrs['href']:
|
||||||
|
self.watch_urls.append(attrs['href'].strip())
|
||||||
|
elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link':
|
||||||
|
self.direct_url = attrs['href'].strip()
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
if tag == 'td':
|
||||||
|
self.found_button = False
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def extract_watch_urls(cls, html):
|
||||||
|
p = cls()
|
||||||
|
p.feed(html)
|
||||||
|
p.close()
|
||||||
|
return p.watch_urls
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def extract_direct_url(cls, html):
|
||||||
|
p = cls()
|
||||||
|
p.feed(html)
|
||||||
|
p.close()
|
||||||
|
return p.direct_url
|
||||||
|
|
||||||
|
class MovieStormIE(InfoExtractor):
|
||||||
|
IE_DESC = 'Movie Storm (link farm)'
|
||||||
|
IE_NAME = 'MovieStorm'
|
||||||
|
_VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)'
|
||||||
|
_LINK_FARM = True
|
||||||
|
|
||||||
|
# There are no tests for this IE because the links on any given moviestorm
|
||||||
|
# page can dynamically change, and because the actual download/extraction
|
||||||
|
# is ultimately preformed by another IE. An example of an acceptable url to
|
||||||
|
# feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1
|
||||||
|
_TEST = False
|
||||||
|
|
||||||
|
# moviestorm's drupal db config is unstable at times
|
||||||
|
# retry up to 5 times before giving up, 5 second delay
|
||||||
|
# between each retry
|
||||||
|
retry_count = 0
|
||||||
|
max_retries = 5
|
||||||
|
retry_wait = 5
|
||||||
|
direct_urls = []
|
||||||
|
|
||||||
|
def _parse_target(self, target):
|
||||||
|
uri = compat_urlparse.urlparse(target)
|
||||||
|
hash = uri.fragment[1:].split('?')[0]
|
||||||
|
token = os.path.basename(hash.rstrip('/'))
|
||||||
|
return (uri, hash, token)
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
# retry loop to capture moviestorm page
|
||||||
|
while True:
|
||||||
|
if self.retry_count == 0:
|
||||||
|
note = 'Downloading link farm page'
|
||||||
|
else:
|
||||||
|
note = ('Unstable db connection, retying again in %s seconds '
|
||||||
|
'[%s/%s]' % (self.retry_wait, self.retry_count,
|
||||||
|
self.max_retries))
|
||||||
|
|
||||||
|
(_, _, token) = self._parse_target(url)
|
||||||
|
farmpage = self._download_webpage(
|
||||||
|
url, token,
|
||||||
|
note=note,
|
||||||
|
errnote='Unable to download link farm page',
|
||||||
|
fatal=False
|
||||||
|
)
|
||||||
|
|
||||||
|
if farmpage.strip() != 'MySQL server has gone away':
|
||||||
|
break
|
||||||
|
|
||||||
|
if self.retry_count < self.max_retries:
|
||||||
|
self.retry_count += 1
|
||||||
|
sleep(self.retry_wait)
|
||||||
|
else:
|
||||||
|
msg = 'The moviestorm database is currently unstable. Please try again later.'
|
||||||
|
raise ExtractorError(msg, expected=True)
|
||||||
|
|
||||||
|
# scrape WATCH button links from moviestorm page
|
||||||
|
self.to_screen(': Extracting watch page urls')
|
||||||
|
watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage)
|
||||||
|
|
||||||
|
# get direct urls from scraped watch pages
|
||||||
|
self.to_screen(': Extracting direct links from watch pages')
|
||||||
|
for watch_url in watch_urls:
|
||||||
|
(_, _, token) = self._parse_target(watch_url)
|
||||||
|
watchpage = self._download_webpage(
|
||||||
|
watch_url, token,
|
||||||
|
note=False,
|
||||||
|
errnote='Unable to download link farm watch page',
|
||||||
|
fatal=False
|
||||||
|
)
|
||||||
|
|
||||||
|
if watchpage is not None:
|
||||||
|
direct_url = MovieStormHTMLParser.extract_direct_url(watchpage)
|
||||||
|
if direct_url:
|
||||||
|
self.direct_urls.append(direct_url)
|
||||||
|
|
||||||
|
self.to_screen(': Passing off farmed links to InfoExtractors')
|
||||||
|
return list(set(self.direct_urls))
|
Loading…
x
Reference in New Issue
Block a user