ArteTvIE: rewrite the extraction process to support the new site (fixes #875)
Videos can be downloaded over RTMP or HTTP, but the best-quality format always seems to use RTMP. The old extraction methods have been deleted.
This commit is contained in:
		
							parent
							
								
									bcd606c0fe
								
							
						
					
					
						commit
						75c9481224
					
				| @ -1,53 +1,21 @@ | |||||||
| import re | import re | ||||||
| import socket | import json | ||||||
| 
 | 
 | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     compat_http_client, |     # This is used by the not implemented extractLiveStream method | ||||||
|     compat_str, |  | ||||||
|     compat_urllib_error, |  | ||||||
|     compat_urllib_parse, |     compat_urllib_parse, | ||||||
|     compat_urllib_request, |  | ||||||
| 
 | 
 | ||||||
|     ExtractorError, |     ExtractorError, | ||||||
|     unified_strdate, |     unified_strdate, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| class ArteTvIE(InfoExtractor): | class ArteTvIE(InfoExtractor): | ||||||
|     """arte.tv information extractor.""" |     _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' | ||||||
| 
 |  | ||||||
|     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' |  | ||||||
|     _LIVE_URL = r'index-[0-9]+\.html$' |     _LIVE_URL = r'index-[0-9]+\.html$' | ||||||
| 
 | 
 | ||||||
|     IE_NAME = u'arte.tv' |     IE_NAME = u'arte.tv' | ||||||
| 
 | 
 | ||||||
|     def fetch_webpage(self, url): |  | ||||||
|         request = compat_urllib_request.Request(url) |  | ||||||
|         try: |  | ||||||
|             self.report_download_webpage(url) |  | ||||||
|             webpage = compat_urllib_request.urlopen(request).read() |  | ||||||
|         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: |  | ||||||
|             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) |  | ||||||
|         except ValueError as err: |  | ||||||
|             raise ExtractorError(u'Invalid URL: %s' % url) |  | ||||||
|         return webpage |  | ||||||
| 
 |  | ||||||
|     def grep_webpage(self, url, regex, regexFlags, matchTuples): |  | ||||||
|         page = self.fetch_webpage(url) |  | ||||||
|         mobj = re.search(regex, page, regexFlags) |  | ||||||
|         info = {} |  | ||||||
| 
 |  | ||||||
|         if mobj is None: |  | ||||||
|             raise ExtractorError(u'Invalid URL: %s' % url) |  | ||||||
| 
 |  | ||||||
|         for (i, key, err) in matchTuples: |  | ||||||
|             if mobj.group(i) is None: |  | ||||||
|                 raise ExtractorError(err) |  | ||||||
|             else: |  | ||||||
|                 info[key] = mobj.group(i) |  | ||||||
| 
 |  | ||||||
|         return info |  | ||||||
| 
 |  | ||||||
|     # TODO implement Live Stream |     # TODO implement Live Stream | ||||||
|     # def extractLiveStream(self, url): |     # def extractLiveStream(self, url): | ||||||
|     #     video_lang = url.split('/')[-4] |     #     video_lang = url.split('/')[-4] | ||||||
| @ -75,62 +43,44 @@ class ArteTvIE(InfoExtractor): | |||||||
|     #     ) |     #     ) | ||||||
|     #     video_url = u'%s/%s' % (info.get('url'), info.get('path')) |     #     video_url = u'%s/%s' % (info.get('url'), info.get('path')) | ||||||
| 
 | 
 | ||||||
|     def extractPlus7Stream(self, url): |  | ||||||
|         video_lang = url.split('/')[-3] |  | ||||||
|         info = self.grep_webpage( |  | ||||||
|             url, |  | ||||||
|             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', |  | ||||||
|             0, |  | ||||||
|             [ |  | ||||||
|                 (1, 'url', u'Invalid URL: %s' % url) |  | ||||||
|             ] |  | ||||||
|         ) |  | ||||||
|         next_url = compat_urllib_parse.unquote(info.get('url')) |  | ||||||
|         info = self.grep_webpage( |  | ||||||
|             next_url, |  | ||||||
|             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang, |  | ||||||
|             0, |  | ||||||
|             [ |  | ||||||
|                 (1, 'url', u'Could not find <video> tag: %s' % url) |  | ||||||
|             ] |  | ||||||
|         ) |  | ||||||
|         next_url = compat_urllib_parse.unquote(info.get('url')) |  | ||||||
| 
 |  | ||||||
|         info = self.grep_webpage( |  | ||||||
|             next_url, |  | ||||||
|             r'<video id="(.*?)".*?>.*?' + |  | ||||||
|                 '<name>(.*?)</name>.*?' + |  | ||||||
|                 '<dateVideo>(.*?)</dateVideo>.*?' + |  | ||||||
|                 '<url quality="hd">(.*?)</url>', |  | ||||||
|             re.DOTALL, |  | ||||||
|             [ |  | ||||||
|                 (1, 'id',    u'could not extract video id: %s' % url), |  | ||||||
|                 (2, 'title', u'could not extract video title: %s' % url), |  | ||||||
|                 (3, 'date',  u'could not extract video date: %s' % url), |  | ||||||
|                 (4, 'url',   u'could not extract video url: %s' % url) |  | ||||||
|             ] |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         return { |  | ||||||
|             'id':           info.get('id'), |  | ||||||
|             'url':          compat_urllib_parse.unquote(info.get('url')), |  | ||||||
|             'uploader':     u'arte.tv', |  | ||||||
|             'upload_date':  unified_strdate(info.get('date')), |  | ||||||
|             'title':        info.get('title').decode('utf-8'), |  | ||||||
|             'ext':          u'mp4', |  | ||||||
|             'format':       u'NA', |  | ||||||
|             'player_url':   None, |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         video_id = url.split('/')[-1] |         mobj = re.match(self._VALID_URL, url) | ||||||
|         self.report_extraction(video_id) |         name = mobj.group('name') | ||||||
|  |         # This is not a real id, it can be for example AJT for the news | ||||||
|  |         # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal | ||||||
|  |         video_id = mobj.group('id') | ||||||
| 
 | 
 | ||||||
|         if re.search(self._LIVE_URL, video_id) is not None: |         if re.search(self._LIVE_URL, video_id) is not None: | ||||||
|             raise ExtractorError(u'Arte live streams are not yet supported, sorry') |             raise ExtractorError(u'Arte live streams are not yet supported, sorry') | ||||||
|             # self.extractLiveStream(url) |             # self.extractLiveStream(url) | ||||||
|             # return |             # return | ||||||
|         else: |  | ||||||
|             info = self.extractPlus7Stream(url) |  | ||||||
| 
 | 
 | ||||||
|         return [info] |         webpage = self._download_webpage(url, video_id) | ||||||
|  |         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') | ||||||
|  | 
 | ||||||
|  |         json_info = self._download_webpage(json_url, video_id, 'Downloading info json') | ||||||
|  |         self.report_extraction(video_id) | ||||||
|  |         info = json.loads(json_info) | ||||||
|  |         player_info = info['videoJsonPlayer'] | ||||||
|  | 
 | ||||||
|  |         info_dict = {'id': player_info['VID'], | ||||||
|  |                      'title': player_info['VTI'], | ||||||
|  |                      'description': player_info['VDE'], | ||||||
|  |                      'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), | ||||||
|  |                      'thumbnail': player_info['programImage'], | ||||||
|  |                      } | ||||||
|  | 
 | ||||||
|  |         formats = player_info['VSR'].values() | ||||||
|  |         # We order the formats by quality | ||||||
|  |         formats = sorted(formats, key=lambda f: int(f['height'])) | ||||||
|  |         # Pick the best quality | ||||||
|  |         format_info = formats[-1] | ||||||
|  |         if format_info['mediaType'] == u'rtmp': | ||||||
|  |             info_dict['url'] = format_info['streamer'] | ||||||
|  |             info_dict['play_path'] = 'mp4:' + format_info['url'] | ||||||
|  |             info_dict['ext'] = 'mp4' | ||||||
|  |         else: | ||||||
|  |             info_dict['url'] = format_info['url'] | ||||||
|  |             info_dict['ext'] = 'mp4' | ||||||
|  | 
 | ||||||
|  |         return info_dict | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user