[nytimes] improve extraction
This commit is contained in:
		
							parent
							
								
									f97ec8bcb9
								
							
						
					
					
						commit
						4191779dcd
					
				| @ -1,26 +1,37 @@ | |||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
|  | import hmac | ||||||
|  | import hashlib | ||||||
|  | import base64 | ||||||
|  | 
 | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     float_or_none, |     float_or_none, | ||||||
|     int_or_none, |     int_or_none, | ||||||
|     parse_iso8601, |     parse_iso8601, | ||||||
|  |     mimetype2ext, | ||||||
|  |     determine_ext, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class NYTimesBaseIE(InfoExtractor): | class NYTimesBaseIE(InfoExtractor): | ||||||
|  |     _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' | ||||||
|  | 
 | ||||||
|     def _extract_video_from_id(self, video_id): |     def _extract_video_from_id(self, video_id): | ||||||
|         video_data = self._download_json( |         # Authorization generation algorithm is reverse engineered from `signer` in | ||||||
|             'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, |         # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js | ||||||
|             video_id, 'Downloading video JSON') |         path = '/svc/video/api/v3/video/' + video_id | ||||||
|  |         hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() | ||||||
|  |         video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ | ||||||
|  |             'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), | ||||||
|  |             'X-NYTV': 'vhs', | ||||||
|  |         }, fatal=False) | ||||||
|  |         if not video_data: | ||||||
|  |             video_data = self._download_json( | ||||||
|  |                 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, | ||||||
|  |                 video_id, 'Downloading video JSON') | ||||||
| 
 | 
 | ||||||
|         title = video_data['headline'] |         title = video_data['headline'] | ||||||
|         description = video_data.get('summary') |  | ||||||
|         duration = float_or_none(video_data.get('duration'), 1000) |  | ||||||
| 
 |  | ||||||
|         uploader = video_data.get('byline') |  | ||||||
|         publication_date = video_data.get('publication_date') |  | ||||||
|         timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None |  | ||||||
| 
 | 
 | ||||||
|         def get_file_size(file_size): |         def get_file_size(file_size): | ||||||
|             if isinstance(file_size, int): |             if isinstance(file_size, int): | ||||||
| @ -28,35 +39,59 @@ class NYTimesBaseIE(InfoExtractor): | |||||||
|             elif isinstance(file_size, dict): |             elif isinstance(file_size, dict): | ||||||
|                 return int(file_size.get('value', 0)) |                 return int(file_size.get('value', 0)) | ||||||
|             else: |             else: | ||||||
|                 return 0 |                 return None | ||||||
| 
 | 
 | ||||||
|         formats = [ |         urls = [] | ||||||
|             { |         formats = [] | ||||||
|                 'url': video['url'], |         for video in video_data.get('renditions', []): | ||||||
|                 'format_id': video.get('type'), |             video_url = video.get('url') | ||||||
|                 'vcodec': video.get('video_codec'), |             format_id = video.get('type') | ||||||
|                 'width': int_or_none(video.get('width')), |             if not video_url or format_id == 'thumbs' or video_url in urls: | ||||||
|                 'height': int_or_none(video.get('height')), |                 continue | ||||||
|                 'filesize': get_file_size(video.get('fileSize')), |             urls.append(video_url) | ||||||
|             } for video in video_data['renditions'] if video.get('url') |             ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url) | ||||||
|         ] |             if ext == 'm3u8': | ||||||
|  |                 formats.extend(self._extract_m3u8_formats( | ||||||
|  |                     video_url, video_id, 'mp4', 'm3u8_native', | ||||||
|  |                     m3u8_id=format_id or 'hls', fatal=False)) | ||||||
|  |             elif ext == 'mpd': | ||||||
|  |                 continue | ||||||
|  |             #     formats.extend(self._extract_mpd_formats( | ||||||
|  |             #         video_url, video_id, format_id or 'dash', fatal=False)) | ||||||
|  |             else: | ||||||
|  |                 formats.append({ | ||||||
|  |                     'url': video_url, | ||||||
|  |                     'format_id': format_id, | ||||||
|  |                     'vcodec': video.get('videoencoding') or video.get('video_codec'), | ||||||
|  |                     'width': int_or_none(video.get('width')), | ||||||
|  |                     'height': int_or_none(video.get('height')), | ||||||
|  |                     'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), | ||||||
|  |                     'tbr': int_or_none(video.get('bitrate'), 1000), | ||||||
|  |                     'ext': ext, | ||||||
|  |                 }) | ||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
| 
 | 
 | ||||||
|         thumbnails = [ |         thumbnails = [] | ||||||
|             { |         for image in video_data.get('images', []): | ||||||
|                 'url': 'http://www.nytimes.com/%s' % image['url'], |             image_url = image.get('url') | ||||||
|  |             if not image_url: | ||||||
|  |                 continue | ||||||
|  |             thumbnails.append({ | ||||||
|  |                 'url': 'http://www.nytimes.com/' + image_url, | ||||||
|                 'width': int_or_none(image.get('width')), |                 'width': int_or_none(image.get('width')), | ||||||
|                 'height': int_or_none(image.get('height')), |                 'height': int_or_none(image.get('height')), | ||||||
|             } for image in video_data.get('images', []) if image.get('url') |             }) | ||||||
|         ] | 
 | ||||||
|  |         publication_date = video_data.get('publication_date') | ||||||
|  |         timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None | ||||||
| 
 | 
 | ||||||
|         return { |         return { | ||||||
|             'id': video_id, |             'id': video_id, | ||||||
|             'title': title, |             'title': title, | ||||||
|             'description': description, |             'description': video_data.get('summary'), | ||||||
|             'timestamp': timestamp, |             'timestamp': timestamp, | ||||||
|             'uploader': uploader, |             'uploader': video_data.get('byline'), | ||||||
|             'duration': duration, |             'duration': float_or_none(video_data.get('duration'), 1000), | ||||||
|             'formats': formats, |             'formats': formats, | ||||||
|             'thumbnails': thumbnails, |             'thumbnails': thumbnails, | ||||||
|         } |         } | ||||||
| @ -67,7 +102,7 @@ class NYTimesIE(NYTimesBaseIE): | |||||||
| 
 | 
 | ||||||
|     _TESTS = [{ |     _TESTS = [{ | ||||||
|         'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', |         'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', | ||||||
|         'md5': '18a525a510f942ada2720db5f31644c0', |         'md5': 'd665342765db043f7e225cff19df0f2d', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': '100000002847155', |             'id': '100000002847155', | ||||||
|             'ext': 'mov', |             'ext': 'mov', | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user