ArteTvIE: rewrite the extraction process to support the new site (fixes #875)
The video can be downloaded over RTMP or HTTP, but the best-quality format always seems to use RTMP. The old extraction methods have been deleted.
This commit is contained in:
		
							parent
							
								
									bcd606c0fe
								
							
						
					
					
						commit
						75c9481224
					
				| @ -1,53 +1,21 @@ | ||||
| import re | ||||
| import socket | ||||
| import json | ||||
| 
 | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     compat_http_client, | ||||
|     compat_str, | ||||
|     compat_urllib_error, | ||||
|     # This is used by the not implemented extractLiveStream method | ||||
|     compat_urllib_parse, | ||||
|     compat_urllib_request, | ||||
| 
 | ||||
|     ExtractorError, | ||||
|     unified_strdate, | ||||
| ) | ||||
| 
 | ||||
| class ArteTvIE(InfoExtractor): | ||||
|     """arte.tv information extractor.""" | ||||
| 
 | ||||
|     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' | ||||
|     _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' | ||||
|     _LIVE_URL = r'index-[0-9]+\.html$' | ||||
| 
 | ||||
|     IE_NAME = u'arte.tv' | ||||
| 
 | ||||
|     def fetch_webpage(self, url): | ||||
|         request = compat_urllib_request.Request(url) | ||||
|         try: | ||||
|             self.report_download_webpage(url) | ||||
|             webpage = compat_urllib_request.urlopen(request).read() | ||||
|         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | ||||
|             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) | ||||
|         except ValueError as err: | ||||
|             raise ExtractorError(u'Invalid URL: %s' % url) | ||||
|         return webpage | ||||
| 
 | ||||
|     def grep_webpage(self, url, regex, regexFlags, matchTuples): | ||||
|         page = self.fetch_webpage(url) | ||||
|         mobj = re.search(regex, page, regexFlags) | ||||
|         info = {} | ||||
| 
 | ||||
|         if mobj is None: | ||||
|             raise ExtractorError(u'Invalid URL: %s' % url) | ||||
| 
 | ||||
|         for (i, key, err) in matchTuples: | ||||
|             if mobj.group(i) is None: | ||||
|                 raise ExtractorError(err) | ||||
|             else: | ||||
|                 info[key] = mobj.group(i) | ||||
| 
 | ||||
|         return info | ||||
| 
 | ||||
|     # TODO implement Live Stream | ||||
|     # def extractLiveStream(self, url): | ||||
|     #     video_lang = url.split('/')[-4] | ||||
| @ -75,62 +43,44 @@ class ArteTvIE(InfoExtractor): | ||||
|     #     ) | ||||
|     #     video_url = u'%s/%s' % (info.get('url'), info.get('path')) | ||||
| 
 | ||||
|     def extractPlus7Stream(self, url): | ||||
|         video_lang = url.split('/')[-3] | ||||
|         info = self.grep_webpage( | ||||
|             url, | ||||
|             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', | ||||
|             0, | ||||
|             [ | ||||
|                 (1, 'url', u'Invalid URL: %s' % url) | ||||
|             ] | ||||
|         ) | ||||
|         next_url = compat_urllib_parse.unquote(info.get('url')) | ||||
|         info = self.grep_webpage( | ||||
|             next_url, | ||||
|             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang, | ||||
|             0, | ||||
|             [ | ||||
|                 (1, 'url', u'Could not find <video> tag: %s' % url) | ||||
|             ] | ||||
|         ) | ||||
|         next_url = compat_urllib_parse.unquote(info.get('url')) | ||||
| 
 | ||||
|         info = self.grep_webpage( | ||||
|             next_url, | ||||
|             r'<video id="(.*?)".*?>.*?' + | ||||
|                 '<name>(.*?)</name>.*?' + | ||||
|                 '<dateVideo>(.*?)</dateVideo>.*?' + | ||||
|                 '<url quality="hd">(.*?)</url>', | ||||
|             re.DOTALL, | ||||
|             [ | ||||
|                 (1, 'id',    u'could not extract video id: %s' % url), | ||||
|                 (2, 'title', u'could not extract video title: %s' % url), | ||||
|                 (3, 'date',  u'could not extract video date: %s' % url), | ||||
|                 (4, 'url',   u'could not extract video url: %s' % url) | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|         return { | ||||
|             'id':           info.get('id'), | ||||
|             'url':          compat_urllib_parse.unquote(info.get('url')), | ||||
|             'uploader':     u'arte.tv', | ||||
|             'upload_date':  unified_strdate(info.get('date')), | ||||
|             'title':        info.get('title').decode('utf-8'), | ||||
|             'ext':          u'mp4', | ||||
|             'format':       u'NA', | ||||
|             'player_url':   None, | ||||
|         } | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         video_id = url.split('/')[-1] | ||||
|         self.report_extraction(video_id) | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         name = mobj.group('name') | ||||
|         # This is not a real id, it can be for example AJT for the news | ||||
|         # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal | ||||
|         video_id = mobj.group('id') | ||||
| 
 | ||||
|         if re.search(self._LIVE_URL, video_id) is not None: | ||||
|             raise ExtractorError(u'Arte live streams are not yet supported, sorry') | ||||
|             # self.extractLiveStream(url) | ||||
|             # return | ||||
|         else: | ||||
|             info = self.extractPlus7Stream(url) | ||||
| 
 | ||||
|         return [info] | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') | ||||
| 
 | ||||
|         json_info = self._download_webpage(json_url, video_id, 'Downloading info json') | ||||
|         self.report_extraction(video_id) | ||||
|         info = json.loads(json_info) | ||||
|         player_info = info['videoJsonPlayer'] | ||||
| 
 | ||||
|         info_dict = {'id': player_info['VID'], | ||||
|                      'title': player_info['VTI'], | ||||
|                      'description': player_info['VDE'], | ||||
|                      'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), | ||||
|                      'thumbnail': player_info['programImage'], | ||||
|                      } | ||||
| 
 | ||||
|         formats = player_info['VSR'].values() | ||||
|         # We order the formats by quality | ||||
|         formats = sorted(formats, key=lambda f: int(f['height'])) | ||||
|         # Pick the best quality | ||||
|         format_info = formats[-1] | ||||
|         if format_info['mediaType'] == u'rtmp': | ||||
|             info_dict['url'] = format_info['streamer'] | ||||
|             info_dict['play_path'] = 'mp4:' + format_info['url'] | ||||
|             info_dict['ext'] = 'mp4' | ||||
|         else: | ||||
|             info_dict['url'] = format_info['url'] | ||||
|             info_dict['ext'] = 'mp4' | ||||
| 
 | ||||
|         return info_dict | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user