| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | # coding: utf-8 | 
					
						
							|  |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | from .common import InfoExtractor | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  | from ..utils import ( | 
					
						
							|  |  |  |     determine_ext, | 
					
						
							|  |  |  |     float_or_none, | 
					
						
							|  |  |  |     int_or_none, | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  |     parse_filesize, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class LibraryOfCongressIE(InfoExtractor): | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |     IE_NAME = 'loc' | 
					
						
							|  |  |  |     IE_DESC = 'Library of Congress' | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |     _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)' | 
					
						
							|  |  |  |     _TESTS = [{ | 
					
						
							| 
									
										
										
										
											2016-06-03 23:55:22 +07:00
										 |  |  |         # embedded via <div class="media-player" | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         'url': 'http://loc.gov/item/90716351/', | 
					
						
							|  |  |  |         'md5': '353917ff7f0255aa6d4b80a034833de8', | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': '90716351', | 
					
						
							|  |  |  |             'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             'title': "Pa's trip to Mars", | 
					
						
							| 
									
										
										
										
											2017-01-02 20:08:07 +08:00
										 |  |  |             'thumbnail': r're:^https?://.*\.jpg$', | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             'duration': 0, | 
					
						
							|  |  |  |             'view_count': int, | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |         }, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |     }, { | 
					
						
							| 
									
										
										
										
											2016-06-03 23:55:22 +07:00
										 |  |  |         # webcast embedded via mediaObjectId | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |         'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578', | 
					
						
							| 
									
										
										
										
											2016-06-03 23:55:22 +07:00
										 |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': '5578', | 
					
						
							|  |  |  |             'ext': 'mp4', | 
					
						
							|  |  |  |             'title': 'Help! Preservation Training Needs Here, There & Everywhere', | 
					
						
							|  |  |  |             'duration': 3765, | 
					
						
							|  |  |  |             'view_count': int, | 
					
						
							|  |  |  |             'subtitles': 'mincount:1', | 
					
						
							|  |  |  |         }, | 
					
						
							|  |  |  |         'params': { | 
					
						
							|  |  |  |             'skip_download': True, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  |     }, { | 
					
						
							|  |  |  |         # with direct download links | 
					
						
							|  |  |  |         'url': 'https://www.loc.gov/item/78710669/', | 
					
						
							|  |  |  |         'info_dict': { | 
					
						
							|  |  |  |             'id': '78710669', | 
					
						
							|  |  |  |             'ext': 'mp4', | 
					
						
							|  |  |  |             'title': 'La vie et la passion de Jesus-Christ', | 
					
						
							|  |  |  |             'duration': 0, | 
					
						
							|  |  |  |             'view_count': int, | 
					
						
							|  |  |  |             'formats': 'mincount:4', | 
					
						
							|  |  |  |         }, | 
					
						
							|  |  |  |         'params': { | 
					
						
							|  |  |  |             'skip_download': True, | 
					
						
							|  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |     }] | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  |     def _real_extract(self, url): | 
					
						
							|  |  |  |         video_id = self._match_id(url) | 
					
						
							|  |  |  |         webpage = self._download_webpage(url, video_id) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         media_id = self._search_regex( | 
					
						
							|  |  |  |             (r'id=(["\'])media-player-(?P<id>.+?)\1', | 
					
						
							|  |  |  |              r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1', | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |              r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1', | 
					
						
							|  |  |  |              r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'), | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             webpage, 'media id', group='id') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |         data = self._download_json( | 
					
						
							|  |  |  |             'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             video_id)['mediaObject'] | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         derivative = data['derivatives'][0] | 
					
						
							|  |  |  |         media_url = derivative['derivativeUrl'] | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  |         title = derivative.get('shortName') or data.get('shortName') or self._og_search_title( | 
					
						
							|  |  |  |             webpage) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         # Following algorithm was extracted from setAVSource js function | 
					
						
							|  |  |  |         # found in webpage | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |         media_url = media_url.replace('rtmp', 'https') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         is_video = data.get('mediaType', 'v').lower() == 'v' | 
					
						
							|  |  |  |         ext = determine_ext(media_url) | 
					
						
							|  |  |  |         if ext not in ('mp4', 'mp3'): | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |             media_url += '.mp4' if is_video else '.mp3' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         if 'vod/mp4:' in media_url: | 
					
						
							|  |  |  |             formats = [{ | 
					
						
							|  |  |  |                 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', | 
					
						
							|  |  |  |                 'format_id': 'hls', | 
					
						
							|  |  |  |                 'ext': 'mp4', | 
					
						
							|  |  |  |                 'protocol': 'm3u8_native', | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  |                 'quality': 1, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             }] | 
					
						
							|  |  |  |         elif 'vod/mp3:' in media_url: | 
					
						
							|  |  |  |             formats = [{ | 
					
						
							|  |  |  |                 'url': media_url.replace('vod/mp3:', ''), | 
					
						
							|  |  |  |                 'vcodec': 'none', | 
					
						
							|  |  |  |             }] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-04 00:26:03 +07:00
										 |  |  |         download_urls = set() | 
					
						
							|  |  |  |         for m in re.finditer( | 
					
						
							|  |  |  |                 r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage): | 
					
						
							|  |  |  |             format_id = m.group('id').lower() | 
					
						
							|  |  |  |             if format_id == 'gif': | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             download_url = m.group('url') | 
					
						
							|  |  |  |             if download_url in download_urls: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             download_urls.add(download_url) | 
					
						
							|  |  |  |             formats.append({ | 
					
						
							|  |  |  |                 'url': download_url, | 
					
						
							|  |  |  |                 'format_id': format_id, | 
					
						
							|  |  |  |                 'filesize_approx': parse_filesize(m.group('size')), | 
					
						
							|  |  |  |             }) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         self._sort_formats(formats) | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |         duration = float_or_none(data.get('duration')) | 
					
						
							|  |  |  |         view_count = int_or_none(data.get('viewCount')) | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-06-03 23:55:22 +07:00
										 |  |  |         subtitles = {} | 
					
						
							|  |  |  |         cc_url = data.get('ccUrl') | 
					
						
							|  |  |  |         if cc_url: | 
					
						
							|  |  |  |             subtitles.setdefault('en', []).append({ | 
					
						
							|  |  |  |                 'url': cc_url, | 
					
						
							|  |  |  |                 'ext': 'ttml', | 
					
						
							|  |  |  |             }) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |         return { | 
					
						
							|  |  |  |             'id': video_id, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             'title': title, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:43:34 +07:00
										 |  |  |             'thumbnail': self._og_search_thumbnail(webpage, default=None), | 
					
						
							| 
									
										
										
										
											2016-06-03 23:19:11 +07:00
										 |  |  |             'duration': duration, | 
					
						
							|  |  |  |             'view_count': view_count, | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |             'formats': formats, | 
					
						
							| 
									
										
										
										
											2016-06-03 23:55:22 +07:00
										 |  |  |             'subtitles': subtitles, | 
					
						
							| 
									
										
										
										
											2016-05-17 16:21:52 +02:00
										 |  |  |         } |