| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | # coding: utf-8 | 
					
						
							|  |  |  |  | from __future__ import unicode_literals | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  | import calendar | 
					
						
							|  |  |  |  | import datetime | 
					
						
							| 
									
										
										
										
											2015-10-17 23:12:58 +01:00
										 |  |  |  | import re | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | from .common import InfoExtractor | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  | from ..compat import ( | 
					
						
							|  |  |  |  |     compat_etree_fromstring, | 
					
						
							|  |  |  |  |     compat_str, | 
					
						
							|  |  |  |  |     compat_parse_qs, | 
					
						
							|  |  |  |  |     compat_xml_parse_error, | 
					
						
							|  |  |  |  | ) | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | from ..utils import ( | 
					
						
							| 
									
										
										
										
											2015-04-21 02:32:10 +08:00
										 |  |  |  |     ExtractorError, | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |     int_or_none, | 
					
						
							|  |  |  |  |     float_or_none, | 
					
						
							| 
									
										
										
										
											2015-12-03 22:01:32 +01:00
										 |  |  |  |     xpath_text, | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | ) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class BiliBiliIE(InfoExtractor): | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |     _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |     _TESTS = [{ | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |         'url': 'http://www.bilibili.tv/video/av1074402/', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |         'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |         'info_dict': { | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |             'id': '1554319', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |             'title': '【金坷垃】金泡沫', | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'duration': 308.315, | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             'timestamp': 1398012660, | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |             'upload_date': '20140420', | 
					
						
							|  |  |  |  |             'thumbnail': 're:^https?://.+\.jpg', | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |             'uploader': '菊子桑', | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             'uploader_id': '156160', | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |     }, { | 
					
						
							|  |  |  |  |         'url': 'http://www.bilibili.com/video/av1041170/', | 
					
						
							|  |  |  |  |         'info_dict': { | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'id': '1507019', | 
					
						
							|  |  |  |  |             'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |             'title': '【BD1080P】刀语【诸神&异域】', | 
					
						
							| 
									
										
										
										
											2015-10-17 23:12:58 +01:00
										 |  |  |  |             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'timestamp': 1396530060, | 
					
						
							|  |  |  |  |             'upload_date': '20140403', | 
					
						
							|  |  |  |  |             'uploader': '枫叶逝去', | 
					
						
							|  |  |  |  |             'uploader_id': '520116', | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2016-06-02 19:27:57 +08:00
										 |  |  |  |     }, { | 
					
						
							|  |  |  |  |         'url': 'http://www.bilibili.com/video/av4808130/', | 
					
						
							|  |  |  |  |         'info_dict': { | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'id': '7802182', | 
					
						
							|  |  |  |  |             'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2016-06-02 19:27:57 +08:00
										 |  |  |  |             'title': '【长篇】哆啦A梦443【钉铛】', | 
					
						
							|  |  |  |  |             'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'timestamp': 1464564180, | 
					
						
							|  |  |  |  |             'upload_date': '20160529', | 
					
						
							|  |  |  |  |             'uploader': '喜欢拉面', | 
					
						
							|  |  |  |  |             'uploader_id': '151066', | 
					
						
							| 
									
										
										
										
											2016-06-02 19:27:57 +08:00
										 |  |  |  |         }, | 
					
						
							| 
									
										
										
										
											2016-06-08 14:29:53 +08:00
										 |  |  |  |     }, { | 
					
						
							|  |  |  |  |         # Missing upload time | 
					
						
							|  |  |  |  |         'url': 'http://www.bilibili.com/video/av1867637/', | 
					
						
							|  |  |  |  |         'info_dict': { | 
					
						
							|  |  |  |  |             'id': '2880301', | 
					
						
							| 
									
										
										
										
											2016-08-08 12:57:17 +08:00
										 |  |  |  |             'ext': 'mp4', | 
					
						
							| 
									
										
										
										
											2016-06-08 14:29:53 +08:00
										 |  |  |  |             'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', | 
					
						
							|  |  |  |  |             'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', | 
					
						
							|  |  |  |  |             'uploader': '黑夜为猫', | 
					
						
							|  |  |  |  |             'uploader_id': '610729', | 
					
						
							|  |  |  |  |         }, | 
					
						
							|  |  |  |  |         'params': { | 
					
						
							|  |  |  |  |             # Just to test metadata extraction | 
					
						
							|  |  |  |  |             'skip_download': True, | 
					
						
							|  |  |  |  |         }, | 
					
						
							|  |  |  |  |         'expected_warnings': ['upload time'], | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |     }] | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |     # BiliBili blocks keys from time to time. The current key is extracted from | 
					
						
							|  |  |  |  |     # the Android client | 
					
						
							|  |  |  |  |     # TODO: find the sign algorithm used in the flash player | 
					
						
							|  |  |  |  |     _APP_KEY = '86385cdc024c0f6c' | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-17 23:12:58 +01:00
										 |  |  |  |     def _real_extract(self, url): | 
					
						
							|  |  |  |  |         mobj = re.match(self._VALID_URL, url) | 
					
						
							|  |  |  |  |         video_id = mobj.group('id') | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         webpage = self._download_webpage(url, video_id) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         params = compat_parse_qs(self._search_regex( | 
					
						
							|  |  |  |  |             [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', | 
					
						
							|  |  |  |  |              r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], | 
					
						
							|  |  |  |  |             webpage, 'player parameters')) | 
					
						
							|  |  |  |  |         cid = params['cid'][0] | 
					
						
							| 
									
										
										
										
											2015-10-17 23:12:58 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         info_xml_str = self._download_webpage( | 
					
						
							|  |  |  |  |             'http://interface.bilibili.com/v_cdn_play', | 
					
						
							|  |  |  |  |             cid, query={'appkey': self._APP_KEY, 'cid': cid}, | 
					
						
							|  |  |  |  |             note='Downloading video info page') | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         err_msg = None | 
					
						
							|  |  |  |  |         durls = None | 
					
						
							|  |  |  |  |         info_xml = None | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) | 
					
						
							|  |  |  |  |         except compat_xml_parse_error: | 
					
						
							|  |  |  |  |             info_json = self._parse_json(info_xml_str, video_id, fatal=False) | 
					
						
							|  |  |  |  |             err_msg = (info_json or {}).get('error_text') | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             err_msg = xpath_text(info_xml, './message') | 
					
						
							| 
									
										
										
										
											2015-05-27 04:23:21 +08:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         if info_xml is not None: | 
					
						
							|  |  |  |  |             durls = info_xml.findall('./durl') | 
					
						
							|  |  |  |  |         if not durls: | 
					
						
							|  |  |  |  |             if err_msg: | 
					
						
							|  |  |  |  |                 raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) | 
					
						
							|  |  |  |  |             else: | 
					
						
							|  |  |  |  |                 raise ExtractorError('No videos found!') | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |         entries = [] | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         for durl in durls: | 
					
						
							| 
									
										
										
										
											2015-12-03 22:01:32 +01:00
										 |  |  |  |             size = xpath_text(durl, ['./filesize', './size']) | 
					
						
							| 
									
										
										
										
											2015-10-21 08:24:05 +01:00
										 |  |  |  |             formats = [{ | 
					
						
							| 
									
										
										
										
											2015-10-17 18:30:51 +01:00
										 |  |  |  |                 'url': durl.find('./url').text, | 
					
						
							| 
									
										
										
										
											2015-12-03 22:01:32 +01:00
										 |  |  |  |                 'filesize': int_or_none(size), | 
					
						
							| 
									
										
										
										
											2015-10-21 08:24:05 +01:00
										 |  |  |  |             }] | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             for backup_url in durl.findall('./backup_url/url'): | 
					
						
							|  |  |  |  |                 formats.append({ | 
					
						
							|  |  |  |  |                     'url': backup_url.text, | 
					
						
							|  |  |  |  |                     # backup URLs have lower priorities | 
					
						
							|  |  |  |  |                     'preference': -2 if 'hd.mp4' in backup_url.text else -3, | 
					
						
							|  |  |  |  |                 }) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |             self._sort_formats(formats) | 
					
						
							| 
									
										
										
										
											2015-10-21 08:24:05 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-04-30 18:23:35 +08:00
										 |  |  |  |             entries.append({ | 
					
						
							| 
									
										
										
										
											2015-12-03 22:01:32 +01:00
										 |  |  |  |                 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), | 
					
						
							|  |  |  |  |                 'duration': int_or_none(xpath_text(durl, './length'), 1000), | 
					
						
							| 
									
										
										
										
											2015-10-17 18:30:51 +01:00
										 |  |  |  |                 'formats': formats, | 
					
						
							| 
									
										
										
										
											2015-01-08 01:33:22 +06:00
										 |  |  |  |             }) | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') | 
					
						
							|  |  |  |  |         description = self._html_search_meta('description', webpage) | 
					
						
							|  |  |  |  |         datetime_str = self._html_search_regex( | 
					
						
							|  |  |  |  |             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) | 
					
						
							| 
									
										
										
										
											2016-06-08 14:29:53 +08:00
										 |  |  |  |         timestamp = None | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         if datetime_str: | 
					
						
							|  |  |  |  |             timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         # TODO 'view_count' requires deobfuscating Javascript | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |         info = { | 
					
						
							| 
									
										
										
										
											2015-12-03 22:01:32 +01:00
										 |  |  |  |             'id': compat_str(cid), | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |             'title': title, | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             'description': description, | 
					
						
							|  |  |  |  |             'timestamp': timestamp, | 
					
						
							|  |  |  |  |             'thumbnail': self._html_search_meta('thumbnailUrl', webpage), | 
					
						
							|  |  |  |  |             'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), | 
					
						
							| 
									
										
										
										
											2014-04-21 13:45:27 +02:00
										 |  |  |  |         } | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |         uploader_mobj = re.search( | 
					
						
							|  |  |  |  |             r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', | 
					
						
							|  |  |  |  |             webpage) | 
					
						
							|  |  |  |  |         if uploader_mobj: | 
					
						
							|  |  |  |  |             info.update({ | 
					
						
							|  |  |  |  |                 'uploader': uploader_mobj.group('name'), | 
					
						
							|  |  |  |  |                 'uploader_id': uploader_mobj.group('id'), | 
					
						
							|  |  |  |  |             }) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         for entry in entries: | 
					
						
							|  |  |  |  |             entry.update(info) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |         if len(entries) == 1: | 
					
						
							|  |  |  |  |             return entries[0] | 
					
						
							|  |  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2016-06-02 19:27:57 +08:00
										 |  |  |  |             for idx, entry in enumerate(entries): | 
					
						
							|  |  |  |  |                 entry['id'] = '%s_part%d' % (video_id, (idx + 1)) | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             return { | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |                 '_type': 'multi_video', | 
					
						
							| 
									
										
										
										
											2015-10-17 23:12:58 +01:00
										 |  |  |  |                 'id': video_id, | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |                 'title': title, | 
					
						
							|  |  |  |  |                 'description': description, | 
					
						
							| 
									
										
										
										
											2015-10-17 17:28:09 +01:00
										 |  |  |  |                 'entries': entries, | 
					
						
							| 
									
										
										
										
											2016-05-29 01:26:00 +08:00
										 |  |  |  |             } |