2015-12-07 00:15:19 +02:00
# coding: utf-8
from __future__ import unicode_literals
2015-12-11 23:00:22 +06:00
2016-06-01 00:48:22 -04:00
import re
2015-12-07 00:15:19 +02:00
from . common import InfoExtractor
2016-04-30 00:17:09 +08:00
from . . compat import (
compat_HTTPError ,
compat_urllib_parse_unquote_plus ,
2016-05-16 10:50:49 -04:00
compat_urllib_parse_urlparse ,
compat_parse_qs ,
2016-04-30 00:17:09 +08:00
)
2015-12-07 00:15:19 +02:00
from . . utils import (
2015-12-11 23:00:22 +06:00
clean_html ,
determine_ext ,
2015-12-12 01:02:54 +06:00
int_or_none ,
2016-05-16 10:50:49 -04:00
float_or_none ,
2015-12-07 00:15:19 +02:00
sanitized_Request ,
ExtractorError ,
2016-05-16 10:50:49 -04:00
urlencode_postdata ,
NO_DEFAULT ,
2016-06-01 00:48:22 -04:00
OnDemandPagedList ,
2015-12-07 00:15:19 +02:00
)
2015-12-11 21:11:45 +06:00
2016-06-01 00:48:22 -04:00
class FunimationBaseIE(InfoExtractor):
    """Base class for Funimation extractors.

    Provides login, translation of CloudFlare "security check" 403
    responses into an actionable error, and recovery of the User-Agent
    stored in the site's ``ci_session`` cookie.
    """

    _NETRC_MACHINE = 'funimation'
    _LOGIN_URL = 'http://www.funimation.com/login'
    # User-Agent sent with the login request when the ``ci_session`` cookie
    # does not provide one.
    _FALLBACK_USER_AGENT = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'

    def _download_webpage(self, *args, **kwargs):
        """Download a webpage, reporting CloudFlare blocks explicitly.

        Delegates to the parent implementation with the same arguments.
        If it fails with an HTTP 403 whose body contains the CloudFlare
        security-check marker, an expected ``ExtractorError`` explaining
        the cookie workaround is raised instead; any other
        ``ExtractorError`` is re-raised unchanged.
        """
        try:
            return super(FunimationBaseIE, self)._download_webpage(*args, **kwargs)
        except ExtractorError as ee:
            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
                response = ee.cause.read()
                if b'>Please complete the security check to access<' in response:
                    raise ExtractorError(
                        'Access to funimation.com is blocked by CloudFlare. '
                        'Please browse to http://www.funimation.com/, solve '
                        'the reCAPTCHA, export browser cookies to a text file, '
                        'and then try again with --cookies YOUR_COOKIE_FILE.',
                        expected=True)
            raise

    def _extract_cloudflare_session_ua(self, url):
        """Return the User-Agent stored in the ``ci_session`` cookie for *url*.

        Returns ``None`` when there is no ``ci_session`` cookie or when no
        ``user_agent`` field can be found in it.
        """
        ci_session_cookie = self._get_cookies(url).get('ci_session')
        if not ci_session_cookie:
            return None
        ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
        # ci_session is a string serialized by PHP function serialize()
        # This case is simple enough to use regular expressions only
        return self._search_regex(
            r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
            default=None)

    def _login(self):
        """Log in with the configured credentials, if any.

        Does nothing when no username is available. Raises
        ``ExtractorError`` when the response page shows no sign of a
        logged-in session; the site's own error message is included when
        one can be extracted.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        data = urlencode_postdata({
            'email_field': username,
            'password_field': password,
        })
        # Prefer the User-Agent recorded in the session cookie; fall back
        # to a fixed browser User-Agent otherwise.
        user_agent = (
            self._extract_cloudflare_session_ua(self._LOGIN_URL) or
            self._FALLBACK_USER_AGENT)
        login_request = sanitized_Request(self._LOGIN_URL, data, headers={
            'User-Agent': user_agent,
            'Content-Type': 'application/x-www-form-urlencoded'
        })
        login_page = self._download_webpage(
            login_request, None, 'Logging in as %s' % username)

        # A logout link in the response is taken as proof of success.
        if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')):
            return

        error = self._html_search_regex(
            r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>',
            login_page, 'error messages', default=None)
        if error:
            raise ExtractorError('Unable to login: %s' % error, expected=True)
        raise ExtractorError('Unable to log in')

    def _real_initialize(self):
        """Attempt to log in before extraction starts."""
        self._login()
2016-06-01 00:48:22 -04:00
class FunimationIE(FunimationBaseIE):
    """Extractor for a single Funimation official or promotional video."""

    _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&"]+)'

    _TESTS = [{
        'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
        'info_dict': {
            'id': '31128',
            'display_id': 'role-play',
            'ext': 'mp4',
            'title': '.hack//SIGN - 1 - Role Play',
            'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
            'thumbnail': r're:https?://.*\.jpg',
        },
        'skip': 'Access without user interaction is forbidden by CloudFlare',
    }, {
        'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
        'info_dict': {
            'id': '9635',
            'display_id': 'broadcast-dub-preview',
            'ext': 'mp4',
            'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
            'thumbnail': r're:https?://.*\.(?:jpg|png)',
        },
        'skip': 'Access without user interaction is forbidden by CloudFlare',
    }]

    # Maps the ``description`` of an entry in the page's videoErrorMessages
    # to the error key that appears in place of a format URL.
    _ERRORS_MAP = {
        'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn',
        'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut',
        'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut',
        'ERROR_VIDEO_EXPIRED': 'videoExpired',
        'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable',
        'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription',
        'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription',
        'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding',
        'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN',
        'ERROR_STREAM_NOT_FOUND': 'streamNotFound',
    }

    _USER_AGENTS = (
        # PC UA is served with m3u8 that provides some bonus lower quality formats
        ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'),
        # Mobile UA allows to extract direct links and also does not fail when
        # PC UA fails with hulu error (e.g.
        # http://www.funimation.com/shows/hacksign/videos/official/role-play)
        ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
    )

    def _find_playlist_item(self, playlist, display_id):
        """Return the ``(season, item)`` pair whose item matches *display_id*.

        Every playlist entry that has ``items`` is searched, in order, for
        an item whose ``itemAK`` equals *display_id*. Raises
        ``ExtractorError`` when no such item exists.
        """
        for season in playlist:
            for item in season.get('items') or []:
                if item.get('itemAK') == display_id:
                    return season, item
        raise ExtractorError(
            'Unable to find %s in players data' % display_id)

    def _parse_error_messages(self, webpage, display_id):
        """Return a dict mapping error keys to the messages found in *webpage*.

        Reads the ``videoErrorMessages`` JavaScript variable; only entries
        of type ``text`` that have both a description known to
        ``_ERRORS_MAP`` and a content are kept. Returns an empty dict when
        the variable is absent or cannot be parsed.
        """
        error_messages = {}
        video_error_messages = self._search_regex(
            r'var\s+videoErrorMessages\s*=\s*({.+?});\n',
            webpage, 'error messages', default=None)
        if not video_error_messages:
            return error_messages
        error_messages_json = self._parse_json(
            video_error_messages, display_id, fatal=False)
        if not error_messages_json:
            return error_messages
        for error in error_messages_json.values():
            description = error.get('description')
            content = error.get('content')
            if error.get('type') != 'text' or not description or not content:
                continue
            error_key = self._ERRORS_MAP.get(description)
            if error_key:
                error_messages[error_key] = content
        return error_messages

    def _real_extract(self, url):
        """Extract formats and metadata for the video at *url*.

        The page is downloaded once per User-Agent (PC and mobile, or only
        the one recovered from the CloudFlare session cookie) and the
        formats found under each are merged. Raises an expected
        ``ExtractorError`` when no format was found and the site reported
        an error in place of a format URL.
        """
        display_id = self._match_id(url)

        user_agents = self._USER_AGENTS
        session_user_agent = self._extract_cloudflare_session_ua(url)
        if session_user_agent:
            # The session cookie is tied to one User-Agent; use only that.
            user_agents = ((None, session_user_agent),)

        # Extract language preference from URL if present
        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
        preference = query.get('watch', [None])[-1]

        errors = []
        formats = []
        season_id = None
        season_number = None
        episode_number = None

        for kind, user_agent in user_agents:
            request = sanitized_Request(url)
            request.add_header('User-Agent', user_agent)
            webpage = self._download_webpage(
                request, display_id,
                'Downloading %s webpage' % kind if kind else 'Downloading webpage')

            playlist = self._parse_json(
                self._search_regex(
                    r'var\s+playersData\s*=\s*(\[.+?\]);\n',
                    webpage, 'players data'),
                display_id)[0]['playlist']

            season, item = self._find_playlist_item(playlist, display_id)
            if season.get('itemClass') == 'season':
                season_id = season.get('itemAK')
                # Only run the regex on an actual string.
                season_number = int_or_none(self._search_regex(
                    r'^Season ([0-9]+)$', season_id, 'season number',
                    default=None)) if season_id else None
                episode_number = float_or_none(item.get('number'))

            error_messages = self._parse_error_messages(webpage, display_id)

            for video in item.get('videoSet', []):
                auth_token = video.get('authToken')
                if not auth_token:
                    continue
                funimation_id = video.get('FUNImationID') or video.get('videoId')
                if not auth_token.startswith('?'):
                    auth_token = '?%s' % auth_token

                # Language info is the same for every format of this video.
                language_mode = video.get('languageMode')
                language = 'en-US' if language_mode == 'dub' else 'ja-JP'
                language_preference = 10 if language_mode == preference else -1

                for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)):
                    format_url = video.get('%sUrl' % quality)
                    if not format_url:
                        continue
                    if not format_url.startswith(('http', '//')):
                        # Not a URL: the value is an error key instead.
                        errors.append(format_url)
                        continue
                    if determine_ext(format_url) == 'm3u8':
                        m3u8_formats = self._extract_m3u8_formats(
                            format_url + auth_token, display_id, 'mp4',
                            entry_protocol='m3u8_native',
                            m3u8_id='%s-hls' % funimation_id, fatal=False)
                        # Add language and preference
                        for m3u8_format in m3u8_formats:
                            m3u8_format['language'] = language
                            m3u8_format['language_preference'] = language_preference
                            formats.append(m3u8_format)
                    else:
                        tbr = int_or_none(self._search_regex(
                            r'-(\d+)[Kk]', format_url, 'tbr', default=None))
                        formats.append({
                            'url': format_url + auth_token,
                            'format_id': '%s-http-%dp' % (funimation_id, height),
                            'height': height,
                            'tbr': tbr,
                            'language': language,
                            'language_preference': language_preference,
                        })

        if not formats and errors:
            # Prefer the site's human-readable message for the first error.
            raise ExtractorError(
                '%s returned error: %s'
                % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))),
                expected=True)

        self._sort_formats(formats)

        title = item['title']
        artist = item.get('artist')
        episode = None
        if artist:
            title = '%s - %s' % (artist, title)
            episode = self._search_regex(
                r'^[0-9]+ - (.*)$', item['title'], 'episode name',
                default=NO_DEFAULT, fatal=False)

        description = self._og_search_description(webpage) or item.get('description')
        if description:
            description = description.strip()
        thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl')
        video_id = item.get('itemId') or display_id

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'series': artist,
            'season_id': season_id,
            'season_number': season_number,
            'episode_id': item.get('videoUrl'),
            'episode': episode,
            'episode_number': episode_number,
            'thumbnail': thumbnail,
            'formats': formats,
        }
2016-06-01 00:48:22 -04:00
class FunimationShowPlaylistIE(FunimationBaseIE):
    """Extractor for a Funimation show page, yielding its episodes as a playlist."""

    IE_NAME = 'funimation:playlist'
    _VALID_URL = r'(?P<seriesurl>https?://(?:www\.)?funimation\.com/shows/(?P<id>[^/]+))(?:/(?:home|about|videos))?$'
    _TESTS = [{
        'url': 'http://www.funimation.com/shows/a-certain-scientific-railgun/home',
        'info_dict': {
            'id': 'a-certain-scientific-railgun',
            'description': 'Misaka’s electro-manipulation abilities – and delightfully destructive Railgun projectile move – make her a rock star in Academy City. The techno-metropolis is packed with supernaturally powered students known as espers, including Misaka’s flirty friend and roommate, Kuroko. She uses her teleportation skills as a member of the Judgment law enforcement team, fighting crime alongside her fellow agent Uiharu. Joined by their friend Saten, a spunky Level 0 esper, Misaka,',
            'title': 'A Certain Scientific Railgun'
        },
        'playlist_count': 48
    }, {
        'url': 'http://www.funimation.com/shows/hacksign/home',
        'info_dict': {
            'id': 'hacksign',
            'description': 'Tsukasa wakes up inside The World, a massive online role-playing game full of magic and monsters, and finds himself unable to log out. With no knowledge of what’s happening in the real world, Tsukasa must discover how he ended up stuck in the game, and what connection he has with the fabled Key of the Twilight—an item that’s rumored to grant ultimate control over the digital realm.',
            'title': '.hack//SIGN'
        },
        'playlist_count': 56
    }]

    # Offset step used when paging through the episode list endpoint.
    _PAGE_SIZE = 20

    def _real_extract(self, url):
        """Return a playlist result whose entries are the show's episodes.

        Title and description come from the series page; entries are
        fetched lazily, one page of the episode list at a time.
        """
        display_id = self._match_id(url)
        user_agent = self._extract_cloudflare_session_ua(url)

        def build_request(request_url):
            # Only override the User-Agent when one was recovered from the
            # session cookie; a None header value must not be sent.
            request = sanitized_Request(request_url)
            if user_agent:
                request.add_header('User-Agent', user_agent)
            return request

        # Use series page to get ID number and title / description
        series_url = self._search_regex(
            self._VALID_URL, url, 'series URL', group='seriesurl')
        webpage = self._download_webpage(
            build_request(series_url), display_id, 'Downloading series webpage')

        # Parseable show data stored as a JavaScript variable
        playlist = self._parse_json(
            self._search_regex(
                r'var\s+playersData\s*=\s*(\[.+?\]);\n',
                webpage, 'players data'),
            display_id)[0]['playlist'][0]

        page_size = self._PAGE_SIZE
        episode_link_re = re.compile(
            r'(?s)<a href="(' + FunimationIE._VALID_URL + r')"')

        def pagefunc(pagenum):
            offset = pagenum * page_size
            # Internal Funimation endpoint for getting paginated video list HTML
            request = build_request(
                'https://www.funimation.com/shows/viewAllFiltered?section=episodes&showid={0}&offset={1}'
                .format(playlist.get('showId'), offset))
            episode_list = self._download_json(
                request, display_id,
                'Downloading episode list from {0}'.format(offset))['main']

            # There are multiple instances of each video URL, so filter for
            # unique URLs while keeping the order of the episodes
            urls_seen = set()
            episode_urls = []
            for match in episode_link_re.finditer(episode_list):
                episode_url = match.group(1)
                if episode_url in urls_seen:
                    continue
                urls_seen.add(episode_url)
                episode_urls.append(episode_url)
            return [
                self.url_result(episode_url, FunimationIE.ie_key())
                for episode_url in episode_urls]

        description = self._og_search_description(webpage) or playlist.get('description')
        if description:
            description = description.strip()

        return {
            '_type': 'playlist',
            'id': display_id,
            'title': playlist.get('artist'),
            'description': description,
            'entries': OnDemandPagedList(pagefunc, page_size, True)
        }