Added new extractors

This commit is contained in:
Joel Leclerc 2014-09-12 04:06:34 -04:00
parent 3fbeb95e14
commit 4020cd0572
6 changed files with 340 additions and 0 deletions

Binary file not shown.

View File

@ -131,6 +131,10 @@ from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .godtube import GodTubeIE
from .gogoanime import (
GoGoAnimeIE,
GoGoAnimeSearchIE
)
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
@ -258,6 +262,11 @@ from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .play44 import (
Play44IE,
ByZooIE,
Video44IE
)
from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
@ -308,6 +317,10 @@ from .smotri import (
from .snotr import SnotrIE
from .sockshare import SockshareIE
from .sohu import SohuIE
from .soulanime import (
SoulAnimeWatchingIE,
SoulAnimeSeriesIE
)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
@ -381,6 +394,7 @@ from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videolecturesnet import VideoLecturesNetIE
from .videofun import VideoFunIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .videott import VideoTtIE

View File

@ -0,0 +1,109 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urlparse,
compat_urllib_parse,
get_element_by_attribute,
unescapeHTML
)
class GoGoAnimeIE(InfoExtractor):
    """Extractor for gogoanime.com video pages.

    The embedded players are located through the iframes found inside the
    element whose class is "postcontent". A page matching the single-part
    layout is delegated as one URL result; a page matching the multi-part
    layout becomes a playlist with one entry per iframe.
    """

    IE_NAME = 'gogoanime'
    IE_DESC = 'GoGoAnime'

    # The dots of the host name are escaped so that they only match a
    # literal "." instead of any character.
    _VALID_URL = r'http://www\.gogoanime\.com/(?P<id>[A-Za-z0-9-]+)'
    _NOT_FOUND_REGEX = r'Oops! Page Not Found</font>'
    # NOTE(review): _FILEKEY_REGEX and _POSTCONTENT_REGEX are not referenced
    # by this class; they are kept unchanged in case other code reads them.
    _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
    _TITLE_REGEX = r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>'
    # Single-part layout: the first iframe is directly followed by </p>.
    _SINGLEPART_REGEX = r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe></p>'
    # Multi-part layout: the first iframe is followed by <br />.
    _MULTIPART_REGEX = r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />'
    _POSTCONTENT_REGEX = r'<div class="postcontent">(?P<content>(?!</div>)*)</div>'
    _IFRAME_REGEX = r'<iframe[^>]*src=[\'"](h[^\'"]+)[\'"]'

    # Disabled single-part test (was previously parked in a string literal):
    # _TEST = {
    #     'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-episode-12',
    #     'md5': 'd9b511f92ce9348206f8481ba19dc9f1',
    #     'info_dict': {
    #         'id': 'Mahou-Shoujo-Madoka-Magica-12',
    #         'ext': 'flv',
    #         'title': 'Mahou-Shoujo-Madoka-Magica-12',
    #         'description': 'Mahou-Shoujo-Madoka-Magica-12'
    #     }
    # },
    _TEST = {
        'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-movie-1'
        },
        'playlist_count': 3
    }

    def _real_extract(self, url):
        """Return a URL result (single part) or a playlist (multi part).

        Raises ExtractorError when the page reports that the video does not
        exist, when no usable embed can be located, or when the page matches
        neither known layout.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        if re.search(self._NOT_FOUND_REGEX, page) is not None:
            raise ExtractorError('Video does not exist', expected=True)

        title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)

        content = get_element_by_attribute("class", "postcontent", page)
        if not content:
            # Without this guard a missing element would surface as an
            # unrelated TypeError from the regex search below.
            raise ExtractorError('Unable to find the post content')

        # Embeds whose address contains "videofun" are skipped, exactly as
        # before. NOTE(review): the reason is not visible here - confirm.
        embed_urls = [
            unescapeHTML(compat_urllib_parse.unquote(src))
            for src in re.findall(self._IFRAME_REGEX, content)
            if 'videofun' not in src
        ]

        if re.search(self._SINGLEPART_REGEX, page):
            if not embed_urls:
                raise ExtractorError('No supported embedded player found')
            return {
                '_type': 'url',
                'id': None,
                'url': embed_urls[0],
                'title': title,
                'description': title
            }

        if re.search(self._MULTIPART_REGEX, page):
            entries = [self.url_result(embed_url) for embed_url in embed_urls]
            return self.playlist_result(entries, video_id)

        # Neither layout matched: fail loudly instead of printing to stdout
        # and handing an empty dict back to the caller.
        raise ExtractorError('Unable to parse the video page')
class GoGoAnimeSearchIE(InfoExtractor):
    """Playlist extractor for gogoanime.com search result pages.

    Every link captured by _POSTLIST_REGEX on the result page becomes one
    playlist entry; the search term taken from the URL is the playlist id.
    """

    IE_NAME = 'gogoanime:search'
    IE_DESC = 'GoGoAnime Search'
    _VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>.*)'
    _POSTLIST_REGEX = r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"'
    _TEST = {
        'url': 'http://www.gogoanime.com/?s=bokusatsu',
        'info_dict': {
            'id': 'bokusatsu',
        },
        'playlist_count': 6,
    }

    def _real_extract(self, url):
        """Download the result page and wrap every hit in a playlist."""
        query = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, query, "Downloading video page")

        entries = []
        for result_url in re.findall(self._POSTLIST_REGEX, webpage):
            entries.append(self.url_result(result_url))
        return self.playlist_result(entries, query)

View File

@ -0,0 +1,82 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class Play44IE(InfoExtractor):
    """Extractor for play44.net embed pages.

    The embed page carries the media address in a `_url = "...";`
    assignment. The file name of that address, without its extension, is
    used as id, title and description.
    """

    IE_NAME = 'play44'
    IE_DESC = 'Play44'
    _VALID_URL = r'http://[w.]*play44\.net/embed\.php[^/]*/(?P<id>.+)'
    _VIDEO_URL_REGEX = r'_url = "(https?://[^"]+)";'
    # Everything after the last "/" up to the first "." (or the end of the
    # string). The terminator used to be an unescaped ".", which swallowed
    # the last character of a file name that has no extension.
    _TITLE_REGEX = r'.*/(?P<title>[^.]*)(?:\.|$)'
    _TEST = {
        'url': 'http://play44.net/embed.php?w=600&h=438&vid=M/mahou-shoujo-madoka-magica-07.flv',
        'md5': 'e37e99d665f503dd2db952f7c4dba9e6',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-07',
            'ext': 'flv',
            'title': 'mahou-shoujo-madoka-magica-07',
            'description': 'mahou-shoujo-madoka-magica-07'
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the media referenced by the embed page."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(
            self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = compat_urllib_parse.unquote(video_url_encoded)

        title_mobj = re.match(self._TITLE_REGEX, video_url)
        # Fall back to the id taken from the page URL rather than returning
        # an empty id/title when the media address has no usable file name.
        title = (title_mobj.group('title') if title_mobj else None) or video_id

        return {
            'id': title,
            'url': video_url,
            'title': title,
            'description': title
        }
class ByZooIE(Play44IE):
    """byzoo.org embed pages, extracted with the logic inherited from Play44IE."""

    IE_NAME = 'byzoo'
    IE_DESC = 'ByZoo'
    _VALID_URL = r'http://[w.]*byzoo\.org/embed\.php[^/]*/(?P<id>.+)'
    _TEST = {
        'url': 'http://byzoo.org/embed.php?w=600&h=438&vid=at/nw/mahou_shoujo_madoka_magica_movie_3_-_part1.mp4',
        'md5': '455c83dabe2cd9fd74a87612b01fe017',
        'info_dict': {
            'id': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
            'ext': 'mp4',
            'title': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
            'description': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
        },
    }
class Video44IE(Play44IE):
    """video44.net player pages, extracted with the logic inherited from Play44IE."""

    IE_NAME = "video44"
    IE_DESC = "Video44"
    # The id is the whole value of the "file" query parameter. The previous
    # pattern `[^&].` captured exactly two characters of it.
    _VALID_URL = r'http://[w.]*video44\.net/.*file=(?P<id>[^&]+).*'
    _TEST = {
        'url': 'http://www.video44.net/gogo/?w=600&h=438&file=chaoshead-12.flv&sv=1',
        'md5': '43eaec6d0beb10e8d42459b9f108aff3',
        'info_dict': {
            'id': 'chaoshead-12',
            'ext': 'mp4',
            'title': 'chaoshead-12',
            'description': 'chaoshead-12'
        }
    }

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class SoulAnimeBaseIE(InfoExtractor):
    """Shared logic for soul-anime.net "watching" pages."""

    _VID_VALID_URL = r'http://[w.]*soul-anime\.net/watching/(?P<id>[^/]+)'
    _VIDEO_URL_REGEX = r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"'

    @staticmethod
    def _ext_from_content_type(content_type):
        """Return the subtype of a Content-Type value, or None.

        Parameters after ";" are ignored, so 'video/mp4; charset=x' yields
        'mp4'. A missing value or one without "/" yields None instead of
        raising.
        """
        if not content_type:
            return None
        mime_type = content_type.split(';', 1)[0].strip()
        if '/' not in mime_type:
            return None
        return mime_type.split('/', 1)[1].strip() or None

    def _down_vid(self, url):
        """Return the info dict for the episode page at `url`.

        The download link found on the page is opened once to read its
        Content-Type header, from which the file extension is derived.
        """
        mobj = re.match(self._VID_VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(
            self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = "http://www.soul-anime.net" + video_url_encoded

        vid = self._request_webpage(video_url, video_id)
        # NOTE(review): assumes the response exposes a `headers` mapping, as
        # urllib responses do on Python 2 and 3; `getheader()` is not
        # available on Python 2 urllib2 responses - confirm.
        ext = self._ext_from_content_type(vid.headers.get('Content-Type'))

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_id,
            'description': video_id
        }
        # Only report an extension that could actually be determined.
        if ext:
            info['ext'] = ext
        return info
class SoulAnimeWatchingIE(SoulAnimeBaseIE):
    """Extractor for a single soul-anime.net "watching" page."""

    IE_NAME = 'soulanime:watching'
    IE_DESC = 'SoulAnime Watching'
    # Same pattern the base class uses to pull the id out of the URL.
    _VALID_URL = SoulAnimeBaseIE._VID_VALID_URL
    _TEST = {
        'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/',
        'md5': '05fae04abf72298098b528e98abf4298',
        'info_dict': {
            'id': 'seirei-tsukai-no-blade-dance-episode-9',
            'ext': 'mp4',
            'title': 'seirei-tsukai-no-blade-dance-episode-9',
            'description': 'seirei-tsukai-no-blade-dance-episode-9',
        },
    }

    def _real_extract(self, url):
        """Delegate to the helper inherited from the base class."""
        info = self._down_vid(url)
        return info
class SoulAnimeSeriesIE(InfoExtractor):
    """Playlist extractor for soul-anime.net series pages.

    Each <option> whose value starts with "/watching/" becomes one playlist
    entry pointing at the corresponding episode page.
    """

    IE_NAME = 'soulanime:series'
    IE_DESC = 'SoulAnime Series'
    _VALID_URL = r'http://[w.]*soul-anime\.net/anime./(?P<id>[^/]+)'
    _EPISODE_REGEX = r'<option value="(/watching/[^"]+)">[^<]*</option>'
    _TEST = {
        'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/',
        'info_dict': {
            'id': 'black-rock-shooter-tv',
        },
        'playlist_count': 8,
    }

    def _real_extract(self, url):
        """Download the series page and list its episodes as a playlist."""
        series_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, series_id, "Downloading series page")

        entries = []
        for episode_path in re.findall(self._EPISODE_REGEX, webpage):
            episode_url = "http://www.soul-anime.net" + episode_path
            entries.append(self.url_result(episode_url))
        return self.playlist_result(entries, series_id)

View File

@ -0,0 +1,48 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class VideoFunIE(InfoExtractor):
    """Extractor for videofun.me embed pages.

    The embed page carries the media address in a `url: "..."` entry
    pointing at gateway.videofun.me. The file name of that address, without
    its extension, is used as id, title and description.
    """

    IE_NAME = 'videofun'
    IE_DESC = 'VideoFun'
    _VALID_URL = r'http://[w.]*videofun\.me/embed/(?P<id>.+)'
    _VIDEO_URL_REGEX = r'url: "(http://gateway\.videofun\.me[^"]+)"'
    # Everything after the last "/" up to the first "." (or the end of the
    # string). The terminator used to be an unescaped ".", which swallowed
    # the last character of a file name that has no extension.
    _TITLE_REGEX = r'.*/(?P<title>[^.]*)(?:\.|$)'
    _TEST = {
        'url': 'http://videofun.me/embed/8267659be070860af600fee7deadbcdb?w=600&h=438',
        'md5': 'e37e99d665f503dd2db952f7c4dba9e6',
        'info_dict': {
            'id': 'Mahou-Shoujo-Madoka-Magica-07',
            'ext': 'flv',
            'title': 'Mahou-Shoujo-Madoka-Magica-07',
            'description': 'Mahou-Shoujo-Madoka-Magica-07'
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the media referenced by the embed page."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(
            self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = compat_urllib_parse.unquote(video_url_encoded)

        title_mobj = re.match(self._TITLE_REGEX, video_url)
        # Fall back to the id taken from the page URL rather than returning
        # an empty id/title when the media address has no usable file name.
        title = (title_mobj.group('title') if title_mobj else None) or video_id

        return {
            'id': title,
            'url': video_url,
            'title': title,
            'description': title
        }