Merge pull request #6428 from dstftw/improve-generic-smil-support
Improve generic SMIL support
This commit is contained in:
		
						commit
						d5d7bdaeb5
					
				| @ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict): | |||||||
|             elif isinstance(expected, compat_str) and expected.startswith('mincount:'): |             elif isinstance(expected, compat_str) and expected.startswith('mincount:'): | ||||||
|                 got = got_dict.get(info_field) |                 got = got_dict.get(info_field) | ||||||
|                 self.assertTrue( |                 self.assertTrue( | ||||||
|                     isinstance(got, list), |                     isinstance(got, (list, dict)), | ||||||
|                     'Expected field %s to be a list, but it is of type %s' % ( |                     'Expected field %s to be a list or a dict, but it is of type %s' % ( | ||||||
|                         info_field, type(got).__name__)) |                         info_field, type(got).__name__)) | ||||||
|                 expected_num = int(expected.partition(':')[2]) |                 expected_num = int(expected.partition(':')[2]) | ||||||
|                 assertGreaterEqual( |                 assertGreaterEqual( | ||||||
|  | |||||||
| @ -136,7 +136,9 @@ def generator(test_case): | |||||||
|                     # We're not using .download here since that is just a shim |                     # We're not using .download here since that is just a shim | ||||||
|                     # for outside error handling, and returns the exit code |                     # for outside error handling, and returns the exit code | ||||||
|                     # instead of the result dict. |                     # instead of the result dict. | ||||||
|                     res_dict = ydl.extract_info(test_case['url']) |                     res_dict = ydl.extract_info( | ||||||
|  |                         test_case['url'], | ||||||
|  |                         force_generic_extractor=params.get('force_generic_extractor', False)) | ||||||
|                 except (DownloadError, ExtractorError) as err: |                 except (DownloadError, ExtractorError) as err: | ||||||
|                     # Check if the exception is not a network related one |                     # Check if the exception is not a network related one | ||||||
|                     if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): |                     if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): | ||||||
|  | |||||||
| @ -18,6 +18,7 @@ from ..compat import ( | |||||||
|     compat_HTTPError, |     compat_HTTPError, | ||||||
|     compat_http_client, |     compat_http_client, | ||||||
|     compat_urllib_error, |     compat_urllib_error, | ||||||
|  |     compat_urllib_parse, | ||||||
|     compat_urllib_parse_urlparse, |     compat_urllib_parse_urlparse, | ||||||
|     compat_urllib_request, |     compat_urllib_request, | ||||||
|     compat_urlparse, |     compat_urlparse, | ||||||
| @ -37,6 +38,7 @@ from ..utils import ( | |||||||
|     RegexNotFoundError, |     RegexNotFoundError, | ||||||
|     sanitize_filename, |     sanitize_filename, | ||||||
|     unescapeHTML, |     unescapeHTML, | ||||||
|  |     url_basename, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -978,69 +980,167 @@ class InfoExtractor(object): | |||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
|         return formats |         return formats | ||||||
| 
 | 
 | ||||||
|     # TODO: improve extraction |     @staticmethod | ||||||
|     def _extract_smil_formats(self, smil_url, video_id, fatal=True): |     def _xpath_ns(path, namespace=None): | ||||||
|         smil = self._download_xml( |         if not namespace: | ||||||
|             smil_url, video_id, 'Downloading SMIL file', |             return path | ||||||
|             'Unable to download SMIL file', fatal=fatal) |         out = [] | ||||||
|  |         for c in path.split('/'): | ||||||
|  |             if not c or c == '.': | ||||||
|  |                 out.append(c) | ||||||
|  |             else: | ||||||
|  |                 out.append('{%s}%s' % (namespace, c)) | ||||||
|  |         return '/'.join(out) | ||||||
|  | 
 | ||||||
|  |     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): | ||||||
|  |         smil = self._download_smil(smil_url, video_id, fatal=fatal) | ||||||
|  | 
 | ||||||
|         if smil is False: |         if smil is False: | ||||||
|             assert not fatal |             assert not fatal | ||||||
|             return [] |             return [] | ||||||
| 
 | 
 | ||||||
|         base = smil.find('./head/meta').get('base') |         namespace = self._parse_smil_namespace(smil) | ||||||
|  | 
 | ||||||
|  |         return self._parse_smil_formats( | ||||||
|  |             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) | ||||||
|  | 
 | ||||||
|  |     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): | ||||||
|  |         smil = self._download_smil(smil_url, video_id, fatal=fatal) | ||||||
|  |         if smil is False: | ||||||
|  |             return {} | ||||||
|  |         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) | ||||||
|  | 
 | ||||||
|  |     def _download_smil(self, smil_url, video_id, fatal=True): | ||||||
|  |         return self._download_xml( | ||||||
|  |             smil_url, video_id, 'Downloading SMIL file', | ||||||
|  |             'Unable to download SMIL file', fatal=fatal) | ||||||
|  | 
 | ||||||
|  |     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): | ||||||
|  |         namespace = self._parse_smil_namespace(smil) | ||||||
|  | 
 | ||||||
|  |         formats = self._parse_smil_formats( | ||||||
|  |             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) | ||||||
|  |         subtitles = self._parse_smil_subtitles(smil, namespace=namespace) | ||||||
|  | 
 | ||||||
|  |         video_id = os.path.splitext(url_basename(smil_url))[0] | ||||||
|  |         title = None | ||||||
|  |         description = None | ||||||
|  |         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): | ||||||
|  |             name = meta.attrib.get('name') | ||||||
|  |             content = meta.attrib.get('content') | ||||||
|  |             if not name or not content: | ||||||
|  |                 continue | ||||||
|  |             if not title and name == 'title': | ||||||
|  |                 title = content | ||||||
|  |             elif not description and name in ('description', 'abstract'): | ||||||
|  |                 description = content | ||||||
|  | 
 | ||||||
|  |         return { | ||||||
|  |             'id': video_id, | ||||||
|  |             'title': title or video_id, | ||||||
|  |             'description': description, | ||||||
|  |             'formats': formats, | ||||||
|  |             'subtitles': subtitles, | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |     def _parse_smil_namespace(self, smil): | ||||||
|  |         return self._search_regex( | ||||||
|  |             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) | ||||||
|  | 
 | ||||||
|  |     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): | ||||||
|  |         base = smil_url | ||||||
|  |         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): | ||||||
|  |             b = meta.get('base') or meta.get('httpBase') | ||||||
|  |             if b: | ||||||
|  |                 base = b | ||||||
|  |                 break | ||||||
| 
 | 
 | ||||||
|         formats = [] |         formats = [] | ||||||
|         rtmp_count = 0 |         rtmp_count = 0 | ||||||
|         if smil.findall('./body/seq/video'): |         http_count = 0 | ||||||
|             video = smil.findall('./body/seq/video')[0] |  | ||||||
|             fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) |  | ||||||
|             formats.extend(fmts) |  | ||||||
|         else: |  | ||||||
|             for video in smil.findall('./body/switch/video'): |  | ||||||
|                 fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) |  | ||||||
|                 formats.extend(fmts) |  | ||||||
| 
 | 
 | ||||||
|         self._sort_formats(formats) |         videos = smil.findall(self._xpath_ns('.//video', namespace)) | ||||||
| 
 |         for video in videos: | ||||||
|         return formats |  | ||||||
| 
 |  | ||||||
|     def _parse_smil_video(self, video, video_id, base, rtmp_count): |  | ||||||
|             src = video.get('src') |             src = video.get('src') | ||||||
|             if not src: |             if not src: | ||||||
|             return [], rtmp_count |                 continue | ||||||
|  | 
 | ||||||
|             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) |             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) | ||||||
|  |             filesize = int_or_none(video.get('size') or video.get('fileSize')) | ||||||
|             width = int_or_none(video.get('width')) |             width = int_or_none(video.get('width')) | ||||||
|             height = int_or_none(video.get('height')) |             height = int_or_none(video.get('height')) | ||||||
|             proto = video.get('proto') |             proto = video.get('proto') | ||||||
|         if not proto: |  | ||||||
|             if base: |  | ||||||
|                 if base.startswith('rtmp'): |  | ||||||
|                     proto = 'rtmp' |  | ||||||
|                 elif base.startswith('http'): |  | ||||||
|                     proto = 'http' |  | ||||||
|             ext = video.get('ext') |             ext = video.get('ext') | ||||||
|         if proto == 'm3u8': |             src_ext = determine_ext(src) | ||||||
|             return self._extract_m3u8_formats(src, video_id, ext), rtmp_count |  | ||||||
|         elif proto == 'rtmp': |  | ||||||
|             rtmp_count += 1 |  | ||||||
|             streamer = video.get('streamer') or base |             streamer = video.get('streamer') or base | ||||||
|             return ([{ | 
 | ||||||
|  |             if proto == 'rtmp' or streamer.startswith('rtmp'): | ||||||
|  |                 rtmp_count += 1 | ||||||
|  |                 formats.append({ | ||||||
|                     'url': streamer, |                     'url': streamer, | ||||||
|                     'play_path': src, |                     'play_path': src, | ||||||
|                     'ext': 'flv', |                     'ext': 'flv', | ||||||
|                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), |                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), | ||||||
|                     'tbr': bitrate, |                     'tbr': bitrate, | ||||||
|  |                     'filesize': filesize, | ||||||
|                     'width': width, |                     'width': width, | ||||||
|                     'height': height, |                     'height': height, | ||||||
|             }], rtmp_count) |                 }) | ||||||
|         elif proto.startswith('http'): |                 continue | ||||||
|             return ([{ | 
 | ||||||
|                 'url': base + src, |             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) | ||||||
|                 'ext': ext or 'flv', | 
 | ||||||
|  |             if proto == 'm3u8' or src_ext == 'm3u8': | ||||||
|  |                 formats.extend(self._extract_m3u8_formats( | ||||||
|  |                     src_url, video_id, ext or 'mp4', m3u8_id='hls')) | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             if src_ext == 'f4m': | ||||||
|  |                 f4m_url = src_url | ||||||
|  |                 if not f4m_params: | ||||||
|  |                     f4m_params = { | ||||||
|  |                         'hdcore': '3.2.0', | ||||||
|  |                         'plugin': 'flowplayer-3.2.0.1', | ||||||
|  |                     } | ||||||
|  |                 f4m_url += '&' if '?' in f4m_url else '?' | ||||||
|  |                 f4m_url += compat_urllib_parse.urlencode(f4m_params) | ||||||
|  |                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |             if src_url.startswith('http'): | ||||||
|  |                 http_count += 1 | ||||||
|  |                 formats.append({ | ||||||
|  |                     'url': src_url, | ||||||
|  |                     'ext': ext or src_ext or 'flv', | ||||||
|  |                     'format_id': 'http-%d' % (bitrate or http_count), | ||||||
|                     'tbr': bitrate, |                     'tbr': bitrate, | ||||||
|  |                     'filesize': filesize, | ||||||
|                     'width': width, |                     'width': width, | ||||||
|                     'height': height, |                     'height': height, | ||||||
|             }], rtmp_count) |                 }) | ||||||
|  |                 continue | ||||||
|  | 
 | ||||||
|  |         self._sort_formats(formats) | ||||||
|  | 
 | ||||||
|  |         return formats | ||||||
|  | 
 | ||||||
|  |     def _parse_smil_subtitles(self, smil, namespace=None): | ||||||
|  |         subtitles = {} | ||||||
|  |         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): | ||||||
|  |             src = textstream.get('src') | ||||||
|  |             if not src: | ||||||
|  |                 continue | ||||||
|  |             ext = textstream.get('ext') or determine_ext(src) | ||||||
|  |             if not ext: | ||||||
|  |                 type_ = textstream.get('type') | ||||||
|  |                 if type_ == 'text/srt': | ||||||
|  |                     ext = 'srt' | ||||||
|  |             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') | ||||||
|  |             subtitles.setdefault(lang, []).append({ | ||||||
|  |                 'url': src, | ||||||
|  |                 'ext': ext, | ||||||
|  |             }) | ||||||
|  |         return subtitles | ||||||
| 
 | 
 | ||||||
|     def _live_title(self, name): |     def _live_title(self, name): | ||||||
|         """ Generate the title for a live video """ |         """ Generate the title for a live video """ | ||||||
|  | |||||||
| @ -130,6 +130,74 @@ class GenericIE(InfoExtractor): | |||||||
|                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', |                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', | ||||||
|             } |             } | ||||||
|         }, |         }, | ||||||
|  |         # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng | ||||||
|  |         { | ||||||
|  |             'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'id': 'smil', | ||||||
|  |                 'ext': 'mp4', | ||||||
|  |                 'title': 'Automatics, robotics and biocybernetics', | ||||||
|  |                 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', | ||||||
|  |                 'formats': 'mincount:16', | ||||||
|  |                 'subtitles': 'mincount:1', | ||||||
|  |             }, | ||||||
|  |             'params': { | ||||||
|  |                 'force_generic_extractor': True, | ||||||
|  |                 'skip_download': True, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |         # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html | ||||||
|  |         { | ||||||
|  |             'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'id': 'hds', | ||||||
|  |                 'ext': 'flv', | ||||||
|  |                 'title': 'hds', | ||||||
|  |                 'formats': 'mincount:1', | ||||||
|  |             }, | ||||||
|  |             'params': { | ||||||
|  |                 'skip_download': True, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |         # SMIL from https://www.restudy.dk/video/play/id/1637 | ||||||
|  |         { | ||||||
|  |             'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'id': 'video_1637', | ||||||
|  |                 'ext': 'flv', | ||||||
|  |                 'title': 'video_1637', | ||||||
|  |                 'formats': 'mincount:3', | ||||||
|  |             }, | ||||||
|  |             'params': { | ||||||
|  |                 'skip_download': True, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |         # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm | ||||||
|  |         { | ||||||
|  |             'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'id': 'smil-service', | ||||||
|  |                 'ext': 'flv', | ||||||
|  |                 'title': 'smil-service', | ||||||
|  |                 'formats': 'mincount:1', | ||||||
|  |             }, | ||||||
|  |             'params': { | ||||||
|  |                 'skip_download': True, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|  |         # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 | ||||||
|  |         { | ||||||
|  |             'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'id': '4719370', | ||||||
|  |                 'ext': 'mp4', | ||||||
|  |                 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', | ||||||
|  |                 'formats': 'mincount:3', | ||||||
|  |             }, | ||||||
|  |             'params': { | ||||||
|  |                 'skip_download': True, | ||||||
|  |             }, | ||||||
|  |         }, | ||||||
|         # google redirect |         # google redirect | ||||||
|         { |         { | ||||||
|             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', |             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', | ||||||
| @ -1123,11 +1191,13 @@ class GenericIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
|         self.report_extraction(video_id) |         self.report_extraction(video_id) | ||||||
| 
 | 
 | ||||||
|         # Is it an RSS feed? |         # Is it an RSS feed or a SMIL file? | ||||||
|         try: |         try: | ||||||
|             doc = parse_xml(webpage) |             doc = parse_xml(webpage) | ||||||
|             if doc.tag == 'rss': |             if doc.tag == 'rss': | ||||||
|                 return self._extract_rss(url, video_id, doc) |                 return self._extract_rss(url, video_id, doc) | ||||||
|  |             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): | ||||||
|  |                 return self._parse_smil(doc, url, video_id) | ||||||
|         except compat_xml_parse_error: |         except compat_xml_parse_error: | ||||||
|             pass |             pass | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -12,7 +12,7 @@ from ..utils import ( | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class VideoLecturesNetIE(InfoExtractor): | class VideoLecturesNetIE(InfoExtractor): | ||||||
|     _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/' |     _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$' | ||||||
|     IE_NAME = 'videolectures.net' |     IE_NAME = 'videolectures.net' | ||||||
| 
 | 
 | ||||||
|     _TEST = { |     _TEST = { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user