Refactor fragments interface and dash segments downloader
- Eliminate segment_urls and initialization_url
- Introduce manifest_url: a manifest may describe unfragmented media, in which case url is used for the direct media URL and manifest_url for the URL of the manifest itself
- Rewrite the dashsegments downloader to use fragments data
- Improve generic MPD extraction
This commit is contained in:
		
							parent
							
								
									21d21b0c72
								
							
						
					
					
						commit
						86f4d14f81
					
				| @ -1,7 +1,6 @@ | |||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
| import re |  | ||||||
| 
 | 
 | ||||||
| from .fragment import FragmentFD | from .fragment import FragmentFD | ||||||
| from ..compat import compat_urllib_error | from ..compat import compat_urllib_error | ||||||
| @ -19,34 +18,32 @@ class DashSegmentsFD(FragmentFD): | |||||||
|     FD_NAME = 'dashsegments' |     FD_NAME = 'dashsegments' | ||||||
| 
 | 
 | ||||||
|     def real_download(self, filename, info_dict): |     def real_download(self, filename, info_dict): | ||||||
|         base_url = info_dict['url'] |         segments = info_dict['fragments'][:1] if self.params.get( | ||||||
|         segment_urls = [info_dict['segment_urls'][0]] if self.params.get('test', False) else info_dict['segment_urls'] |             'test', False) else info_dict['fragments'] | ||||||
|         initialization_url = info_dict.get('initialization_url') |  | ||||||
| 
 | 
 | ||||||
|         ctx = { |         ctx = { | ||||||
|             'filename': filename, |             'filename': filename, | ||||||
|             'total_frags': len(segment_urls) + (1 if initialization_url else 0), |             'total_frags': len(segments), | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         self._prepare_and_start_frag_download(ctx) |         self._prepare_and_start_frag_download(ctx) | ||||||
| 
 | 
 | ||||||
|         def combine_url(base_url, target_url): |  | ||||||
|             if re.match(r'^https?://', target_url): |  | ||||||
|                 return target_url |  | ||||||
|             return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) |  | ||||||
| 
 |  | ||||||
|         segments_filenames = [] |         segments_filenames = [] | ||||||
| 
 | 
 | ||||||
|         fragment_retries = self.params.get('fragment_retries', 0) |         fragment_retries = self.params.get('fragment_retries', 0) | ||||||
|         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) |         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) | ||||||
| 
 | 
 | ||||||
|         def process_segment(segment, tmp_filename, fatal): |         def process_segment(segment, tmp_filename, num): | ||||||
|             target_url, segment_name = segment |             segment_url = segment['url'] | ||||||
|  |             segment_name = 'Frag%d' % num | ||||||
|             target_filename = '%s-%s' % (tmp_filename, segment_name) |             target_filename = '%s-%s' % (tmp_filename, segment_name) | ||||||
|  |             # In DASH, the first segment contains necessary headers to | ||||||
|  |             # generate a valid MP4 file, so always abort for the first segment | ||||||
|  |             fatal = num == 0 or not skip_unavailable_fragments | ||||||
|             count = 0 |             count = 0 | ||||||
|             while count <= fragment_retries: |             while count <= fragment_retries: | ||||||
|                 try: |                 try: | ||||||
|                     success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) |                     success = ctx['dl'].download(target_filename, {'url': segment_url}) | ||||||
|                     if not success: |                     if not success: | ||||||
|                         return False |                         return False | ||||||
|                     down, target_sanitized = sanitize_open(target_filename, 'rb') |                     down, target_sanitized = sanitize_open(target_filename, 'rb') | ||||||
| @ -72,16 +69,8 @@ class DashSegmentsFD(FragmentFD): | |||||||
|                 return False |                 return False | ||||||
|             return True |             return True | ||||||
| 
 | 
 | ||||||
|         segments_to_download = [(initialization_url, 'Init')] if initialization_url else [] |         for i, segment in enumerate(segments): | ||||||
|         segments_to_download.extend([ |             if not process_segment(segment, ctx['tmpfilename'], i): | ||||||
|             (segment_url, 'Seg%d' % i) |  | ||||||
|             for i, segment_url in enumerate(segment_urls)]) |  | ||||||
| 
 |  | ||||||
|         for i, segment in enumerate(segments_to_download): |  | ||||||
|             # In DASH, the first segment contains necessary headers to |  | ||||||
|             # generate a valid MP4 file, so always abort for the first segment |  | ||||||
|             fatal = i == 0 or not skip_unavailable_fragments |  | ||||||
|             if not process_segment(segment, ctx['tmpfilename'], fatal): |  | ||||||
|                 return False |                 return False | ||||||
| 
 | 
 | ||||||
|         self._finish_frag_download(ctx) |         self._finish_frag_download(ctx) | ||||||
|  | |||||||
| @ -86,9 +86,10 @@ class InfoExtractor(object): | |||||||
|                     from worst to best quality. |                     from worst to best quality. | ||||||
| 
 | 
 | ||||||
|                     Potential fields: |                     Potential fields: | ||||||
|                     * url        Mandatory. The URL of the video file or URL of |                     * url        Mandatory. The URL of the video file | ||||||
|                                  the manifest file in case of fragmented media |                     * manifest_url | ||||||
|                                  (DASH, hls, hds). |                                  The URL of the manifest file in case of | ||||||
|  |                                  fragmented media (DASH, hls, hds) | ||||||
|                     * ext        Will be calculated from URL if missing |                     * ext        Will be calculated from URL if missing | ||||||
|                     * format     A human-readable description of the format |                     * format     A human-readable description of the format | ||||||
|                                  ("mp4 container with h264/opus"). |                                  ("mp4 container with h264/opus"). | ||||||
| @ -1528,9 +1529,10 @@ class InfoExtractor(object): | |||||||
|         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() |         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() | ||||||
| 
 | 
 | ||||||
|         return self._parse_mpd_formats( |         return self._parse_mpd_formats( | ||||||
|             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) |             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, | ||||||
|  |             formats_dict=formats_dict, mpd_url=mpd_url) | ||||||
| 
 | 
 | ||||||
|     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): |     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): | ||||||
|         """ |         """ | ||||||
|         Parse formats from MPD manifest. |         Parse formats from MPD manifest. | ||||||
|         References: |         References: | ||||||
| @ -1654,6 +1656,7 @@ class InfoExtractor(object): | |||||||
|                         f = { |                         f = { | ||||||
|                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, |                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, | ||||||
|                             'url': base_url, |                             'url': base_url, | ||||||
|  |                             'manifest_url': mpd_url, | ||||||
|                             'ext': mimetype2ext(mime_type), |                             'ext': mimetype2ext(mime_type), | ||||||
|                             'width': int_or_none(representation_attrib.get('width')), |                             'width': int_or_none(representation_attrib.get('width')), | ||||||
|                             'height': int_or_none(representation_attrib.get('height')), |                             'height': int_or_none(representation_attrib.get('height')), | ||||||
| @ -1682,14 +1685,6 @@ class InfoExtractor(object): | |||||||
|                                 if 'total_number' not in representation_ms_info and 'segment_duration': |                                 if 'total_number' not in representation_ms_info and 'segment_duration': | ||||||
|                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) |                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) | ||||||
|                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) |                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) | ||||||
|                                 representation_ms_info['segment_urls'] = [ |  | ||||||
|                                     media_template % { |  | ||||||
|                                         'Number': segment_number, |  | ||||||
|                                         'Bandwidth': representation_attrib.get('bandwidth'), |  | ||||||
|                                     } |  | ||||||
|                                     for segment_number in range( |  | ||||||
|                                         representation_ms_info['start_number'], |  | ||||||
|                                         representation_ms_info['total_number'] + representation_ms_info['start_number'])] |  | ||||||
|                                 representation_ms_info['fragments'] = [{ |                                 representation_ms_info['fragments'] = [{ | ||||||
|                                     'url': media_template % { |                                     'url': media_template % { | ||||||
|                                         'Number': segment_number, |                                         'Number': segment_number, | ||||||
| @ -1703,7 +1698,6 @@ class InfoExtractor(object): | |||||||
|                                 # $Number*$ or $Time$ in media template with S list available |                                 # $Number*$ or $Time$ in media template with S list available | ||||||
|                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg |                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg | ||||||
|                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 |                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 | ||||||
|                                 representation_ms_info['segment_urls'] = [] |  | ||||||
|                                 representation_ms_info['fragments'] = [] |                                 representation_ms_info['fragments'] = [] | ||||||
|                                 segment_time = 0 |                                 segment_time = 0 | ||||||
|                                 segment_d = None |                                 segment_d = None | ||||||
| @ -1715,7 +1709,6 @@ class InfoExtractor(object): | |||||||
|                                         'Bandwidth': representation_attrib.get('bandwidth'), |                                         'Bandwidth': representation_attrib.get('bandwidth'), | ||||||
|                                         'Number': segment_number, |                                         'Number': segment_number, | ||||||
|                                     } |                                     } | ||||||
|                                     representation_ms_info['segment_urls'].append(segment_url) |  | ||||||
|                                     representation_ms_info['fragments'].append({ |                                     representation_ms_info['fragments'].append({ | ||||||
|                                         'url': segment_url, |                                         'url': segment_url, | ||||||
|                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']), |                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']), | ||||||
| @ -1745,17 +1738,15 @@ class InfoExtractor(object): | |||||||
|                                         'duration': float_or_none(s['d'], representation_ms_info['timescale']), |                                         'duration': float_or_none(s['d'], representation_ms_info['timescale']), | ||||||
|                                     }) |                                     }) | ||||||
|                             representation_ms_info['fragments'] = fragments |                             representation_ms_info['fragments'] = fragments | ||||||
|                         if 'segment_urls' in representation_ms_info: |                         # NB: MPD manifest may contain direct URLs to unfragmented media. | ||||||
|  |                         # No fragments key is present in this case. | ||||||
|  |                         if 'fragments' in representation_ms_info: | ||||||
|                             f.update({ |                             f.update({ | ||||||
|                                 'segment_urls': representation_ms_info['segment_urls'], |  | ||||||
|                                 'fragments': [], |                                 'fragments': [], | ||||||
|                                 'protocol': 'http_dash_segments', |                                 'protocol': 'http_dash_segments', | ||||||
|                             }) |                             }) | ||||||
|                             if 'initialization_url' in representation_ms_info: |                             if 'initialization_url' in representation_ms_info: | ||||||
|                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) |                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) | ||||||
|                                 f.update({ |  | ||||||
|                                     'initialization_url': initialization_url, |  | ||||||
|                                 }) |  | ||||||
|                                 if not f.get('url'): |                                 if not f.get('url'): | ||||||
|                                     f['url'] = initialization_url |                                     f['url'] = initialization_url | ||||||
|                                 f['fragments'].append({'url': initialization_url}) |                                 f['fragments'].append({'url': initialization_url}) | ||||||
|  | |||||||
| @ -1657,7 +1657,9 @@ class GenericIE(InfoExtractor): | |||||||
|                 return self.playlist_result(self._parse_xspf(doc, video_id), video_id) |                 return self.playlist_result(self._parse_xspf(doc, video_id), video_id) | ||||||
|             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): |             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): | ||||||
|                 info_dict['formats'] = self._parse_mpd_formats( |                 info_dict['formats'] = self._parse_mpd_formats( | ||||||
|                     doc, video_id, mpd_base_url=url.rpartition('/')[0]) |                     doc, video_id, | ||||||
|  |                     mpd_base_url=full_response.geturl().rpartition('/')[0], | ||||||
|  |                     mpd_url=url) | ||||||
|                 self._sort_formats(info_dict['formats']) |                 self._sort_formats(info_dict['formats']) | ||||||
|                 return info_dict |                 return info_dict | ||||||
|             elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): |             elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user