[tagesschau] Add support for playlists
This commit is contained in:
		
							parent
							
								
									9e1b96ae40
								
							
						
					
					
						commit
						4c1b2e5c0e
					
				| @ -8,11 +8,11 @@ from ..utils import parse_filesize | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TagesschauIE(InfoExtractor): | class TagesschauIE(InfoExtractor): | ||||||
|     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_?[^/#?]+?)?\.html' |     _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_?[^/#?]+?)?\.html' | ||||||
| 
 | 
 | ||||||
|     _TESTS = [{ |     _TESTS = [{ | ||||||
|         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', |         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', | ||||||
|         'md5': '917a228bc7df7850783bc47979673a09', |         'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': '102143', |             'id': '102143', | ||||||
|             'ext': 'mp4', |             'ext': 'mp4', | ||||||
| @ -40,6 +40,13 @@ class TagesschauIE(InfoExtractor): | |||||||
|             'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', |             'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', | ||||||
|             'thumbnail': 're:^https?:.*\.jpg$', |             'thumbnail': 're:^https?:.*\.jpg$', | ||||||
|         }, |         }, | ||||||
|  |     }, { | ||||||
|  |         'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': '135', | ||||||
|  |             'title': 'Möchtegern-Underdog mit Machtanspruch', | ||||||
|  |         }, | ||||||
|  |         'playlist_count': 2, | ||||||
|     }, { |     }, { | ||||||
|         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', |         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', | ||||||
|         'only_matching': True, |         'only_matching': True, | ||||||
| @ -75,6 +82,41 @@ class TagesschauIE(InfoExtractor): | |||||||
|         'xxl': {'quality': 5}, |         'xxl': {'quality': 5}, | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     def _extract_formats(self, download_text): | ||||||
|  |         links = re.finditer( | ||||||
|  |             r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', | ||||||
|  |             download_text) | ||||||
|  |         formats = [] | ||||||
|  |         for l in links: | ||||||
|  |             format_id = self._search_regex( | ||||||
|  |                 r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') | ||||||
|  |             format = { | ||||||
|  |                 'format_id': format_id, | ||||||
|  |                 'url': l.group('url'), | ||||||
|  |                 'format_name': l.group('name'), | ||||||
|  |             } | ||||||
|  |             m = re.match( | ||||||
|  |                 r'''(?x) | ||||||
|  |                     Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; | ||||||
|  |                     (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; | ||||||
|  |                     (?P<vbr>[0-9]+)kbps&\#10; | ||||||
|  |                     Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; | ||||||
|  |                     Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', | ||||||
|  |                 l.group('title')) | ||||||
|  |             if m: | ||||||
|  |                 format.update({ | ||||||
|  |                     'format_note': m.group('audio_desc'), | ||||||
|  |                     'vcodec': m.group('vcodec'), | ||||||
|  |                     'width': int(m.group('width')), | ||||||
|  |                     'height': int(m.group('height')), | ||||||
|  |                     'abr': int(m.group('abr')), | ||||||
|  |                     'vbr': int(m.group('vbr')), | ||||||
|  |                     'filesize_approx': parse_filesize(m.group('filesize_approx')), | ||||||
|  |                 }) | ||||||
|  |             formats.append(format) | ||||||
|  |         self._sort_formats(formats) | ||||||
|  |         return formats | ||||||
|  | 
 | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         video_id = self._match_id(url) |         video_id = self._match_id(url) | ||||||
|         display_id = video_id.lstrip('-') |         display_id = video_id.lstrip('-') | ||||||
| @ -94,14 +136,14 @@ class TagesschauIE(InfoExtractor): | |||||||
|                         (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))? |                         (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))? | ||||||
|                     ''', playerpage): |                     ''', playerpage): | ||||||
|                 url = media.group('url') |                 url = media.group('url') | ||||||
|                 type_ = media.group('type') |                 webpage_type = media.group('type') | ||||||
|                 ext = media.group('ext') |                 ext = media.group('ext') | ||||||
|                 res = media.group('quality') |                 res = media.group('quality') | ||||||
|                 f = { |                 f = { | ||||||
|                     'format_id': '%s_%s' % (res, ext) if res else ext, |                     'format_id': '%s_%s' % (res, ext) if res else ext, | ||||||
|                     'url': url, |                     'url': url, | ||||||
|                     'ext': ext, |                     'ext': ext, | ||||||
|                     'vcodec': 'none' if type_ == 'audio' else None, |                     'vcodec': 'none' if webpage_type == 'audio' else None, | ||||||
|                 } |                 } | ||||||
|                 f.update(self._FORMATS.get(res, {})) |                 f.update(self._FORMATS.get(res, {})) | ||||||
|                 formats.append(f) |                 formats.append(f) | ||||||
| @ -109,47 +151,31 @@ class TagesschauIE(InfoExtractor): | |||||||
|             title = self._og_search_title(webpage).strip() |             title = self._og_search_title(webpage).strip() | ||||||
|             description = self._og_search_description(webpage).strip() |             description = self._og_search_description(webpage).strip() | ||||||
|         else: |         else: | ||||||
|             download_text = self._search_regex( |  | ||||||
|                 r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>', |  | ||||||
|                 webpage, 'download links') |  | ||||||
|             links = re.finditer( |  | ||||||
|                 r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', |  | ||||||
|                 download_text) |  | ||||||
|             formats = [] |  | ||||||
|             for l in links: |  | ||||||
|                 format_id = self._search_regex( |  | ||||||
|                     r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') |  | ||||||
|                 format = { |  | ||||||
|                     'format_id': format_id, |  | ||||||
|                     'url': l.group('url'), |  | ||||||
|                     'format_name': l.group('name'), |  | ||||||
|                 } |  | ||||||
|                 m = re.match( |  | ||||||
|                     r'''(?x) |  | ||||||
|                         Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; |  | ||||||
|                         (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; |  | ||||||
|                         (?P<vbr>[0-9]+)kbps&\#10; |  | ||||||
|                         Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; |  | ||||||
|                         Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', |  | ||||||
|                     l.group('title')) |  | ||||||
|                 if m: |  | ||||||
|                     format.update({ |  | ||||||
|                         'format_note': m.group('audio_desc'), |  | ||||||
|                         'vcodec': m.group('vcodec'), |  | ||||||
|                         'width': int(m.group('width')), |  | ||||||
|                         'height': int(m.group('height')), |  | ||||||
|                         'abr': int(m.group('abr')), |  | ||||||
|                         'vbr': int(m.group('vbr')), |  | ||||||
|                         'filesize_approx': parse_filesize(m.group('filesize_approx')), |  | ||||||
|                     }) |  | ||||||
|                 formats.append(format) |  | ||||||
|             thumbnail = self._og_search_thumbnail(webpage) |  | ||||||
|             description = self._html_search_regex( |  | ||||||
|                 r'(?s)<p class="teasertext">(.*?)</p>', |  | ||||||
|                 webpage, 'description', default=None) |  | ||||||
|             title = self._html_search_regex( |             title = self._html_search_regex( | ||||||
|                 r'<span class="headline".*?>(.*?)</span>', webpage, 'title') |                 r'<span class="headline".*?>(.*?)</span>', webpage, 'title') | ||||||
| 
 | 
 | ||||||
|  |             DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>' | ||||||
|  | 
 | ||||||
|  |             webpage_type = self._og_search_property('type', webpage, default=None) | ||||||
|  |             if webpage_type == 'website':  # Article | ||||||
|  |                 entries = [] | ||||||
|  |                 for num, (entry_title, download_text) in enumerate(re.findall( | ||||||
|  |                         r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, | ||||||
|  |                         webpage)): | ||||||
|  |                     entries.append({ | ||||||
|  |                         'id': display_id, | ||||||
|  |                         'title': '%s-%d' % (entry_title, num), | ||||||
|  |                         'formats': self._extract_formats(download_text), | ||||||
|  |                     }) | ||||||
|  |                 return self.playlist_result(entries, display_id, title) | ||||||
|  |             else:  # Assume single video | ||||||
|  |                 download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') | ||||||
|  |                 formats = self._extract_formats(download_text) | ||||||
|  |                 thumbnail = self._og_search_thumbnail(webpage) | ||||||
|  |                 description = self._html_search_regex( | ||||||
|  |                     r'(?s)<p class="teasertext">(.*?)</p>', | ||||||
|  |                     webpage, 'description', default=None) | ||||||
|  | 
 | ||||||
|         self._sort_formats(formats) |         self._sort_formats(formats) | ||||||
| 
 | 
 | ||||||
|         return { |         return { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user