| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from .common import InfoExtractor | 
					
						
							|  |  |  | from ..utils import ( | 
					
						
							|  |  |  |     ExtractorError, | 
					
						
							|  |  |  |     orderedSet, | 
					
						
							|  |  |  |     unescapeHTML, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class StanfordOpenClassroomIE(InfoExtractor): | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |     IE_NAME = 'stanfordoc' | 
					
						
							|  |  |  |     IE_DESC = 'Stanford Open ClassRoom' | 
					
						
							|  |  |  |     _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' | 
					
						
							| 
									
										
										
										
											2013-06-27 20:46:46 +02:00
										 |  |  |     _TEST = { | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |         'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', | 
					
						
							|  |  |  |         'md5': '544a9468546059d4e80d76265b0443b8', | 
					
						
							|  |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': 'PracticalUnix_intro-environment', | 
					
						
							|  |  |  |             'ext': 'mp4', | 
					
						
							|  |  |  |             'title': 'Intro Environment', | 
					
						
							| 
									
										
										
										
											2013-06-27 20:46:46 +02:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _real_extract(self, url): | 
					
						
							|  |  |  |         mobj = re.match(self._VALID_URL, url) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |         if mobj.group('course') and mobj.group('video'):  # A specific video | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |             course = mobj.group('course') | 
					
						
							|  |  |  |             video = mobj.group('video') | 
					
						
							|  |  |  |             info = { | 
					
						
							|  |  |  |                 'id': course + '_' + video, | 
					
						
							|  |  |  |                 'uploader': None, | 
					
						
							|  |  |  |                 'upload_date': None, | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' | 
					
						
							|  |  |  |             xmlUrl = baseUrl + video + '.xml' | 
					
						
							| 
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 |  |  |             mdoc = self._download_xml(xmlUrl, info['id']) | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 info['title'] = mdoc.findall('./title')[0].text | 
					
						
							|  |  |  |                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text | 
					
						
							|  |  |  |             except IndexError: | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |                 raise ExtractorError('Invalid metadata XML file') | 
					
						
							|  |  |  |             return info | 
					
						
							|  |  |  |         elif mobj.group('course'):  # A course page | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |             course = mobj.group('course') | 
					
						
							|  |  |  |             info = { | 
					
						
							|  |  |  |                 'id': course, | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |                 '_type': 'playlist', | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |                 'uploader': None, | 
					
						
							|  |  |  |                 'upload_date': None, | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             coursepage = self._download_webpage( | 
					
						
							|  |  |  |                 url, info['id'], | 
					
						
							|  |  |  |                 note='Downloading course info page', | 
					
						
							|  |  |  |                 errnote='Unable to download course info page') | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             info['title'] = self._html_search_regex( | 
					
						
							|  |  |  |                 r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             info['description'] = self._html_search_regex( | 
					
						
							|  |  |  |                 r'(?s)<description>([^<]+)</description>', | 
					
						
							|  |  |  |                 coursepage, 'description', fatal=False) | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-10-09 23:50:53 +07:00
										 |  |  |             links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage)) | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             info['entries'] = [self.url_result( | 
					
						
							|  |  |  |                 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) | 
					
						
							|  |  |  |             ) for l in links] | 
					
						
							|  |  |  |             return info | 
					
						
							|  |  |  |         else:  # Root page | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |             info = { | 
					
						
							|  |  |  |                 'id': 'Stanford OpenClassroom', | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |                 '_type': 'playlist', | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  |                 'uploader': None, | 
					
						
							|  |  |  |                 'upload_date': None, | 
					
						
							|  |  |  |             } | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             info['title'] = info['id'] | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' | 
					
						
							| 
									
										
										
										
											2013-12-08 22:24:55 +01:00
										 |  |  |             rootpage = self._download_webpage(rootURL, info['id'], | 
					
						
							| 
									
										
										
										
											2014-11-23 21:39:15 +01:00
										 |  |  |                                               errnote='Unable to download course info page') | 
					
						
							| 
									
										
										
										
											2013-06-23 21:16:32 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-10-09 23:50:53 +07:00
										 |  |  |             links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage)) | 
					
						
							| 
									
										
										
										
											2014-11-20 16:34:54 +01:00
										 |  |  |             info['entries'] = [self.url_result( | 
					
						
							|  |  |  |                 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l) | 
					
						
							|  |  |  |             ) for l in links] | 
					
						
							|  |  |  |             return info |