Added new extractors

This commit is contained in:
Joel Leclerc 2014-09-12 04:06:34 -04:00
parent 4d46c1c68c
commit f9fa66dcac
5 changed files with 340 additions and 0 deletions

View File

@ -144,6 +144,10 @@ from .globo import GloboIE
from .godtube import GodTubeIE
from .goldenmoustache import GoldenMoustacheIE
from .golem import GolemIE
from .gogoanime import (
GoGoAnimeIE,
GoGoAnimeSearchIE
)
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
@ -286,6 +290,11 @@ from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
from .planetaplay import PlanetaPlayIE
from .played import PlayedIE
from .play44 import (
Play44IE,
ByZooIE,
Video44IE
)
from .playfm import PlayFMIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
@ -341,6 +350,10 @@ from .smotri import (
from .snotr import SnotrIE
from .sockshare import SockshareIE
from .sohu import SohuIE
from .soulanime import (
SoulAnimeWatchingIE,
SoulAnimeSeriesIE
)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
@ -429,6 +442,7 @@ from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videolecturesnet import VideoLecturesNetIE
from .videofun import VideoFunIE
from .videofyme import VideofyMeIE
from .videomega import VideoMegaIE
from .videopremium import VideoPremiumIE

View File

@ -0,0 +1,109 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urlparse,
compat_urllib_parse,
get_element_by_attribute,
unescapeHTML
)
class GoGoAnimeIE(InfoExtractor):
    """Extractor for gogoanime.com video pages.

    The page is scanned for ``<iframe>`` sources inside the element whose
    class is ``postcontent``.  A page matching ``_SINGLEPART_REGEX`` is
    returned as a single URL result pointing at the first iframe; a page
    matching ``_MULTIPART_REGEX`` is returned as a playlist with one URL
    result per iframe.
    """

    IE_NAME = 'gogoanime'
    IE_DESC = 'GoGoAnime'
    # The dots of the host name are escaped so that they only match a
    # literal '.' and not any character.
    _VALID_URL = r'http://www\.gogoanime\.com/(?P<id>[A-Za-z0-9-]+)'
    _NOT_FOUND_REGEX = r'Oops! Page Not Found</font>'
    _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
    _TITLE_REGEX = r'<div class="postdesc">[^<]*<h1>([^<]+)</h1>'
    _SINGLEPART_REGEX = r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe></p>'
    _MULTIPART_REGEX = r'<div class="postcontent">[^<]*<p><iframe src=[\'"][^>]+></iframe><br />'
    # NOTE(review): not referenced by any method of this class.
    _POSTCONTENT_REGEX = r'<div class="postcontent">(?P<content>(?!</div>)*)</div>'
    _IFRAME_REGEX = r'<iframe[^>]*src=[\'"](h[^\'"]+)[\'"]'

    # Disabled single-part test, kept for reference:
    #   'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-episode-12',
    #   'md5': 'd9b511f92ce9348206f8481ba19dc9f1',
    #   'info_dict': {
    #       'id': 'Mahou-Shoujo-Madoka-Magica-12',
    #       'ext': 'flv',
    #       'title': 'Mahou-Shoujo-Madoka-Magica-12',
    #       'description': 'Mahou-Shoujo-Madoka-Magica-12'
    #   }
    _TEST = {
        'url': 'http://www.gogoanime.com/mahou-shoujo-madoka-magica-movie-1',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-movie-1'
        },
        'playlist_count': 3
    }

    def _real_extract(self, url):
        """Resolve a gogoanime page into a URL result or a playlist.

        Raises ExtractorError when the page reports "not found", when the
        post content or a usable iframe cannot be located, or when the page
        matches neither the single-part nor the multi-part layout.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        if re.search(self._NOT_FOUND_REGEX, page) is not None:
            raise ExtractorError('Video does not exist', expected=True)

        title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)

        content = get_element_by_attribute("class", "postcontent", page)
        if content is None:
            # Without this guard re.findall() would fail with a TypeError.
            raise ExtractorError('Unable to find the post content')

        # iframe sources containing "videofun" are skipped, as in the
        # original implementation (the reason is not documented there).
        vids = [
            unescapeHTML(compat_urllib_parse.unquote(src))
            for src in re.findall(self._IFRAME_REGEX, content)
            if 'videofun' not in src
        ]

        if re.search(self._SINGLEPART_REGEX, page):
            if not vids:
                raise ExtractorError('No supported embedded video found')
            return {
                '_type': 'url',
                'id': None,
                'url': vids[0],
                'title': title,
                'description': title
            }

        if re.search(self._MULTIPART_REGEX, page):
            return self.playlist_result([self.url_result(vid) for vid in vids], video_id)

        # Report the failure through the extractor error channel instead of
        # printing to stdout and handing back an empty result.
        raise ExtractorError('Unable to parse the video page')
class GoGoAnimeSearchIE(InfoExtractor):
    """Turn a gogoanime.com search URL into a playlist of result links."""

    IE_NAME = 'gogoanime:search'
    IE_DESC = 'GoGoAnime Search'
    _VALID_URL = r'http://www\.gogoanime\.com/.*\?s=(?P<id>.*)'
    _POSTLIST_REGEX = r'<div class="postlist">[^<]*<p[^>]*>[^<]*<a href="(?P<url>[^"]+)"'
    _TEST = {
        'url': 'http://www.gogoanime.com/?s=bokusatsu',
        'info_dict': {
            'id': 'bokusatsu',
        },
        'playlist_count': 6,
    }

    def _real_extract(self, url):
        """Download the search page and wrap every matched link as a URL result."""
        # Everything after "?s=" in the URL is used as the playlist id.
        query = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, query, "Downloading video page")

        entries = []
        for link in re.findall(self._POSTLIST_REGEX, webpage):
            entries.append(self.url_result(link))
        return self.playlist_result(entries, query)

View File

@ -0,0 +1,82 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class Play44IE(InfoExtractor):
    """Extractor for play44.net embed pages.

    The embed page is downloaded, the media URL is read from the
    ``_url = "..."`` assignment in the page, and the title/id are derived
    from the file name of that media URL.
    """

    IE_NAME = 'play44'
    IE_DESC = 'Play44'
    _VALID_URL = r'http://[w.]*play44\.net/embed\.php[^/]*/(?P<id>.+)'
    _VIDEO_URL_REGEX = r'_url = "(https?://[^"]+)";'
    # Last path component up to (not including) its first dot.  The previous
    # pattern ended in an unescaped '.', which swallowed one arbitrary
    # character and therefore truncated file names without an extension.
    _TITLE_REGEX = r'.*/(?P<title>[^/.]+)'
    _TEST = {
        'url': 'http://play44.net/embed.php?w=600&h=438&vid=M/mahou-shoujo-madoka-magica-07.flv',
        'md5': 'e37e99d665f503dd2db952f7c4dba9e6',
        'info_dict': {
            'id': 'mahou-shoujo-madoka-magica-07',
            'ext': 'flv',
            'title': 'mahou-shoujo-madoka-magica-07',
            'description': 'mahou-shoujo-madoka-magica-07'
        }
    }

    def _real_extract(self, url):
        """Return the info dict (id, url, title, description) for an embed URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = compat_urllib_parse.unquote(video_url_encoded)

        # Only the part before the query string is inspected, so that a '/'
        # or '.' inside a query parameter cannot be mistaken for the file name.
        title_mobj = re.match(self._TITLE_REGEX, video_url.split('?', 1)[0])
        # Fall back to the id taken from the page URL rather than failing
        # with an AttributeError on an unexpected media URL.
        title = title_mobj.group('title') if title_mobj else video_id

        return {
            'id': title,
            'url': video_url,
            'title': title,
            'description': title
        }
class ByZooIE(Play44IE):
    """byzoo.org embed pages; extraction logic is inherited from Play44IE."""

    IE_NAME = 'byzoo'
    IE_DESC = 'ByZoo'
    _VALID_URL = r'http://[w.]*byzoo\.org/embed\.php[^/]*/(?P<id>.+)'
    _TEST = {
        'url': 'http://byzoo.org/embed.php?w=600&h=438&vid=at/nw/mahou_shoujo_madoka_magica_movie_3_-_part1.mp4',
        'md5': '455c83dabe2cd9fd74a87612b01fe017',
        'info_dict': {
            'id': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
            'ext': 'mp4',
            'title': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
            'description': 'mahou_shoujo_madoka_magica_movie_3_-_part1',
        },
    }
class Video44IE(Play44IE):
    """video44.net player pages; extraction logic is inherited from Play44IE."""

    IE_NAME = "video44"
    IE_DESC = "Video44"
    # The id is the whole value of the "file" query parameter.  The previous
    # pattern "[^&]." captured exactly two characters of it.
    _VALID_URL = r'http://[w.]*video44\.net/.*file=(?P<id>[^&]+).*'
    _TEST = {
        'url': 'http://www.video44.net/gogo/?w=600&h=438&file=chaoshead-12.flv&sv=1',
        'md5': '43eaec6d0beb10e8d42459b9f108aff3',
        'info_dict': {
            'id': 'chaoshead-12',
            'ext': 'mp4',
            'title': 'chaoshead-12',
            'description': 'chaoshead-12'
        }
    }

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class SoulAnimeBaseIE(InfoExtractor):
    """Shared helper for soul-anime.net extractors.

    Provides ``_down_vid``, which resolves a ``/watching/...`` page into an
    info dict for the file linked from the page's download box.
    """

    _VID_VALID_URL = r'http://[w.]*soul-anime\.net/watching/(?P<id>[^/]+)'
    _VIDEO_URL_REGEX = r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"'

    @staticmethod
    def _ext_from_content_type(content_type):
        """Return the subtype of a Content-Type value, or None if unusable.

        Parameters such as "; charset=utf-8" are ignored, so "video/mp4" and
        "video/mp4; codecs=avc1" both give "mp4".
        """
        if not content_type:
            return None
        mime_type = content_type.split(';', 1)[0].strip()
        parts = mime_type.split('/')
        if len(parts) != 2 or not parts[1]:
            return None
        return parts[1]

    def _down_vid(self, url):
        """Build the info dict for a single soul-anime.net watching page.

        The page is downloaded, the download link is located with
        ``_VIDEO_URL_REGEX`` and prefixed with the site root, and the file
        extension is taken from the Content-Type of the media response.
        The ``ext`` key is only set when that header yields a subtype.
        """
        mobj = re.match(self._VID_VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = "http://www.soul-anime.net" + video_url_encoded

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_id,
            'description': video_id
        }

        vid = self._request_webpage(video_url, video_id)
        try:
            # The ``headers`` mapping is available on both Python 2 and
            # Python 3 response objects, whereas ``getheader()`` exists on
            # the Python 3 response only.
            content_type = vid.headers.get('Content-Type')
        finally:
            # Only the headers are needed; release the connection.
            vid.close()

        ext = self._ext_from_content_type(content_type)
        if ext:
            info['ext'] = ext
        return info
class SoulAnimeWatchingIE(SoulAnimeBaseIE):
    """Single-episode extractor for soul-anime.net "watching" pages."""

    IE_NAME = 'soulanime:watching'
    IE_DESC = 'SoulAnime Watching'
    # Same pattern the base class uses to pull the id out of the URL.
    _VALID_URL = SoulAnimeBaseIE._VID_VALID_URL
    _TEST = {
        'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/',
        'md5': '05fae04abf72298098b528e98abf4298',
        'info_dict': {
            'id': 'seirei-tsukai-no-blade-dance-episode-9',
            'ext': 'mp4',
            'title': 'seirei-tsukai-no-blade-dance-episode-9',
            'description': 'seirei-tsukai-no-blade-dance-episode-9',
        },
    }

    def _real_extract(self, url):
        """Delegate to the shared ``_down_vid`` helper of the base class."""
        info = self._down_vid(url)
        return info
class SoulAnimeSeriesIE(InfoExtractor):
    """Playlist extractor for soul-anime.net series pages."""

    IE_NAME = 'soulanime:series'
    IE_DESC = 'SoulAnime Series'
    _VALID_URL = r'http://[w.]*soul-anime\.net/anime./(?P<id>[^/]+)'
    _EPISODE_REGEX = r'<option value="(/watching/[^"]+)">[^<]*</option>'
    _TEST = {
        'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/',
        'info_dict': {
            'id': 'black-rock-shooter-tv',
        },
        'playlist_count': 8,
    }

    def _real_extract(self, url):
        """Collect every ``/watching/...`` option on the series page as a URL result."""
        series_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, series_id, "Downloading series page")

        entries = []
        for episode_path in re.findall(self._EPISODE_REGEX, webpage):
            # The option values are site-relative paths; prefix the site root.
            entries.append(self.url_result("http://www.soul-anime.net" + episode_path))
        return self.playlist_result(entries, series_id)

View File

@ -0,0 +1,48 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
compat_urllib_parse
)
class VideoFunIE(InfoExtractor):
    """Extractor for videofun.me embed pages.

    The embed page is downloaded, the media URL is read from the
    ``url: "http://gateway.videofun.me..."`` entry in the page, and the
    title/id are derived from the file name of that media URL.
    """

    IE_NAME = 'videofun'
    IE_DESC = 'VideoFun'
    _VALID_URL = r'http://[w.]*videofun\.me/embed/(?P<id>.+)'
    _VIDEO_URL_REGEX = r'url: "(http://gateway\.videofun\.me[^"]+)"'
    # Last path component up to (not including) its first dot.  The previous
    # pattern ended in an unescaped '.', which swallowed one arbitrary
    # character and therefore truncated file names without an extension.
    _TITLE_REGEX = r'.*/(?P<title>[^/.]+)'
    _TEST = {
        'url': 'http://videofun.me/embed/8267659be070860af600fee7deadbcdb?w=600&h=438',
        'md5': 'e37e99d665f503dd2db952f7c4dba9e6',
        'info_dict': {
            'id': 'Mahou-Shoujo-Madoka-Magica-07',
            'ext': 'flv',
            'title': 'Mahou-Shoujo-Madoka-Magica-07',
            'description': 'Mahou-Shoujo-Madoka-Magica-07'
        }
    }

    def _real_extract(self, url):
        """Return the info dict (id, url, title, description) for an embed URL."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        page = self._download_webpage(url, video_id, "Downloading video page")
        video_url_encoded = self._html_search_regex(self._VIDEO_URL_REGEX, page, 'url', fatal=True)
        video_url = compat_urllib_parse.unquote(video_url_encoded)

        # Only the part before the query string is inspected, so that a '/'
        # or '.' inside a query parameter cannot be mistaken for the file name.
        title_mobj = re.match(self._TITLE_REGEX, video_url.split('?', 1)[0])
        # Fall back to the id taken from the page URL rather than failing
        # with an AttributeError on an unexpected media URL.
        title = title_mobj.group('title') if title_mobj else video_id

        return {
            'id': title,
            'url': video_url,
            'title': title,
            'description': title
        }