2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from __future__ import unicode_literals
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import re
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from .common import InfoExtractor
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								from ..utils import (
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ExtractorError,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    orderedSet,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    unescapeHTML,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class StanfordOpenClassroomIE(InfoExtractor):
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    IE_NAME = 'stanfordoc'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    IE_DESC = 'Stanford Open ClassRoom'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-27 20:46:46 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    _TEST = {
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'md5': '544a9468546059d4e80d76265b0443b8',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        'info_dict': {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'id': 'PracticalUnix_intro-environment',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'ext': 'mp4',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            'title': 'Intro Environment',
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-27 20:46:46 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    def _real_extract(self, url):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        mobj = re.match(self._VALID_URL, url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if mobj.group('course') and mobj.group('video'):  # A specific video
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            course = mobj.group('course')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            video = mobj.group('video')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            info = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'id': course + '_' + video,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'uploader': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'upload_date': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            xmlUrl = baseUrl + video + '.xml'
							 | 
						
					
						
							
								
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            mdoc = self._download_xml(xmlUrl, info['id'])
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            try:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                info['title'] = mdoc.findall('./title')[0].text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            except IndexError:
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                raise ExtractorError('Invalid metadata XML file')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return info
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        elif mobj.group('course'):  # A course page
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            course = mobj.group('course')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            info = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'id': course,
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                '_type': 'playlist',
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'uploader': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'upload_date': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            coursepage = self._download_webpage(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                url, info['id'],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                note='Downloading course info page',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                errnote='Unable to download course info page')
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info['title'] = self._html_search_regex(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info['description'] = self._html_search_regex(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                r'(?s)<description>([^<]+)</description>',
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                coursepage, 'description', fatal=False)
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info['entries'] = [self.url_result(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ) for l in links]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return info
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:  # Root page
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            info = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'id': 'Stanford OpenClassroom',
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                '_type': 'playlist',
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'uploader': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'upload_date': None,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info['title'] = info['id']
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
							 | 
						
					
						
							
								
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            rootpage = self._download_webpage(rootURL, info['id'],
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-23 21:39:15 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                                              errnote='Unable to download course info page')
							 | 
						
					
						
							
								
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
							 | 
						
					
						
							
								
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            info['entries'] = [self.url_result(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            ) for l in links]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return info
							 |