141 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			141 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								# coding: utf-8
							 | 
						||
| 
								 | 
							
								from __future__ import unicode_literals
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								import re
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from .common import InfoExtractor
							 | 
						||
| 
								 | 
							
								from ..utils import (
							 | 
						||
| 
								 | 
							
								    clean_html,
							 | 
						||
| 
								 | 
							
								    determine_ext,
							 | 
						||
| 
								 | 
							
								    ExtractorError,
							 | 
						||
| 
								 | 
							
								    float_or_none,
							 | 
						||
| 
								 | 
							
								    get_element_by_class,
							 | 
						||
| 
								 | 
							
								    get_element_by_id,
							 | 
						||
| 
								 | 
							
								    parse_duration,
							 | 
						||
| 
								 | 
							
								    remove_end,
							 | 
						||
| 
								 | 
							
								    urlencode_postdata,
							 | 
						||
| 
								 | 
							
								    urljoin,
							 | 
						||
| 
								 | 
							
								)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								class TeamTreeHouseIE(InfoExtractor):
							 | 
						||
| 
								 | 
							
								    _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)'
							 | 
						||
| 
								 | 
							
								    _TESTS = [{
							 | 
						||
| 
								 | 
							
								        # Course
							 | 
						||
| 
								 | 
							
								        'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php',
							 | 
						||
| 
								 | 
							
								        'info_dict': {
							 | 
						||
| 
								 | 
							
								            'id': 'introduction-to-user-authentication-in-php',
							 | 
						||
| 
								 | 
							
								            'title': 'Introduction to User Authentication in PHP',
							 | 
						||
| 
								 | 
							
								            'description': 'md5:405d7b4287a159b27ddf30ca72b5b053',
							 | 
						||
| 
								 | 
							
								        },
							 | 
						||
| 
								 | 
							
								        'playlist_mincount': 24,
							 | 
						||
| 
								 | 
							
								    }, {
							 | 
						||
| 
								 | 
							
								        # WorkShop
							 | 
						||
| 
								 | 
							
								        'url': 'https://teamtreehouse.com/library/deploying-a-react-app',
							 | 
						||
| 
								 | 
							
								        'info_dict': {
							 | 
						||
| 
								 | 
							
								            'id': 'deploying-a-react-app',
							 | 
						||
| 
								 | 
							
								            'title': 'Deploying a React App',
							 | 
						||
| 
								 | 
							
								            'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921',
							 | 
						||
| 
								 | 
							
								        },
							 | 
						||
| 
								 | 
							
								        'playlist_mincount': 4,
							 | 
						||
| 
								 | 
							
								    }, {
							 | 
						||
| 
								 | 
							
								        # Video
							 | 
						||
| 
								 | 
							
								        'url': 'https://teamtreehouse.com/library/application-overview-2',
							 | 
						||
| 
								 | 
							
								        'info_dict': {
							 | 
						||
| 
								 | 
							
								            'id': 'application-overview-2',
							 | 
						||
| 
								 | 
							
								            'ext': 'mp4',
							 | 
						||
| 
								 | 
							
								            'title': 'Application Overview',
							 | 
						||
| 
								 | 
							
								            'description': 'md5:4b0a234385c27140a4378de5f1e15127',
							 | 
						||
| 
								 | 
							
								        },
							 | 
						||
| 
								 | 
							
								        'expected_warnings': ['This is just a preview'],
							 | 
						||
| 
								 | 
							
								    }]
							 | 
						||
| 
								 | 
							
								    _NETRC_MACHINE = 'teamtreehouse'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def _real_initialize(self):
							 | 
						||
| 
								 | 
							
								        email, password = self._get_login_info()
							 | 
						||
| 
								 | 
							
								        if email is None:
							 | 
						||
| 
								 | 
							
								            return
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        signin_page = self._download_webpage(
							 | 
						||
| 
								 | 
							
								            'https://teamtreehouse.com/signin',
							 | 
						||
| 
								 | 
							
								            None, 'Downloading signin page')
							 | 
						||
| 
								 | 
							
								        data = self._form_hidden_inputs('new_user_session', signin_page)
							 | 
						||
| 
								 | 
							
								        data.update({
							 | 
						||
| 
								 | 
							
								            'user_session[email]': email,
							 | 
						||
| 
								 | 
							
								            'user_session[password]': password,
							 | 
						||
| 
								 | 
							
								        })
							 | 
						||
| 
								 | 
							
								        error_message = get_element_by_class('error-message', self._download_webpage(
							 | 
						||
| 
								 | 
							
								            'https://teamtreehouse.com/person_session',
							 | 
						||
| 
								 | 
							
								            None, 'Logging in', data=urlencode_postdata(data)))
							 | 
						||
| 
								 | 
							
								        if error_message:
							 | 
						||
| 
								 | 
							
								            raise ExtractorError(clean_html(error_message), expected=True)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    def _real_extract(self, url):
							 | 
						||
| 
								 | 
							
								        display_id = self._match_id(url)
							 | 
						||
| 
								 | 
							
								        webpage = self._download_webpage(url, display_id)
							 | 
						||
| 
								 | 
							
								        title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
							 | 
						||
| 
								 | 
							
								        description = self._html_search_meta(
							 | 
						||
| 
								 | 
							
								            ['description', 'og:description', 'twitter:description'], webpage)
							 | 
						||
| 
								 | 
							
								        entries = self._parse_html5_media_entries(url, webpage, display_id)
							 | 
						||
| 
								 | 
							
								        if entries:
							 | 
						||
| 
								 | 
							
								            info = entries[0]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            for subtitles in info.get('subtitles', {}).values():
							 | 
						||
| 
								 | 
							
								                for subtitle in subtitles:
							 | 
						||
| 
								 | 
							
								                    subtitle['ext'] = determine_ext(subtitle['url'], 'srt')
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            is_preview = 'data-preview="true"' in webpage
							 | 
						||
| 
								 | 
							
								            if is_preview:
							 | 
						||
| 
								 | 
							
								                self.report_warning(
							 | 
						||
| 
								 | 
							
								                    'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id)
							 | 
						||
| 
								 | 
							
								                duration = 30
							 | 
						||
| 
								 | 
							
								            else:
							 | 
						||
| 
								 | 
							
								                duration = float_or_none(self._search_regex(
							 | 
						||
| 
								 | 
							
								                    r'data-duration="(\d+)"', webpage, 'duration'), 1000)
							 | 
						||
| 
								 | 
							
								                if not duration:
							 | 
						||
| 
								 | 
							
								                    duration = parse_duration(get_element_by_id(
							 | 
						||
| 
								 | 
							
								                        'video-duration', webpage))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            info.update({
							 | 
						||
| 
								 | 
							
								                'id': display_id,
							 | 
						||
| 
								 | 
							
								                'title': title,
							 | 
						||
| 
								 | 
							
								                'description': description,
							 | 
						||
| 
								 | 
							
								                'duration': duration,
							 | 
						||
| 
								 | 
							
								            })
							 | 
						||
| 
								 | 
							
								            return info
							 | 
						||
| 
								 | 
							
								        else:
							 | 
						||
| 
								 | 
							
								            def extract_urls(html, extract_info=None):
							 | 
						||
| 
								 | 
							
								                for path in re.findall(r'<a[^>]+href="([^"]+)"', html):
							 | 
						||
| 
								 | 
							
								                    page_url = urljoin(url, path)
							 | 
						||
| 
								 | 
							
								                    entry = {
							 | 
						||
| 
								 | 
							
								                        '_type': 'url_transparent',
							 | 
						||
| 
								 | 
							
								                        'id': self._match_id(page_url),
							 | 
						||
| 
								 | 
							
								                        'url': page_url,
							 | 
						||
| 
								 | 
							
								                        'id_key': self.ie_key(),
							 | 
						||
| 
								 | 
							
								                    }
							 | 
						||
| 
								 | 
							
								                    if extract_info:
							 | 
						||
| 
								 | 
							
								                        entry.update(extract_info)
							 | 
						||
| 
								 | 
							
								                    entries.append(entry)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            workshop_videos = self._search_regex(
							 | 
						||
| 
								 | 
							
								                r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>',
							 | 
						||
| 
								 | 
							
								                webpage, 'workshop videos', default=None)
							 | 
						||
| 
								 | 
							
								            if workshop_videos:
							 | 
						||
| 
								 | 
							
								                extract_urls(workshop_videos)
							 | 
						||
| 
								 | 
							
								            else:
							 | 
						||
| 
								 | 
							
								                stages_path = self._search_regex(
							 | 
						||
| 
								 | 
							
								                    r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"',
							 | 
						||
| 
								 | 
							
								                    webpage, 'stages path')
							 | 
						||
| 
								 | 
							
								                if stages_path:
							 | 
						||
| 
								 | 
							
								                    stages_page = self._download_webpage(
							 | 
						||
| 
								 | 
							
								                        urljoin(url, stages_path), display_id, 'Downloading stages page')
							 | 
						||
| 
								 | 
							
								                    for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1):
							 | 
						||
| 
								 | 
							
								                        extract_urls(steps_list, {
							 | 
						||
| 
								 | 
							
								                            'chapter': chapter,
							 | 
						||
| 
								 | 
							
								                            'chapter_number': chapter_number,
							 | 
						||
| 
								 | 
							
								                        })
							 | 
						||
| 
								 | 
							
								                    title = remove_end(title, ' Course')
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            return self.playlist_result(
							 | 
						||
| 
								 | 
							
								                entries, display_id, title, description)
							 |