Review notes (free text ahead of the first "diff --git" header; not part of any hunk).

Purpose: let BiliBiliIE accept an optional "?p=N" page selector on video URLs.
When a page is given, the extractor first looks for the cid paired with that
page number, appends "_pN" to both the id and the title, and adds 'cid' to the
returned info dict.

- The page-specific cid pattern ends in \b so that p=1 cannot match "page":10.
- When the URL carries no "?p=", page_id is None; the page-specific pattern is
  then built from the text "None", does not match, and the lookup falls
  through to the generic cid pattern that follows it.
- NOTE(review): the last hunk deletes BilibiliAudioBaseIE, BilibiliAudioIE and
  BilibiliAudioAlbumIE, and the first hunk drops the str_or_none import they
  use. That looks unrelated to multi-part support - confirm it is intentional
  and that nothing else still imports those classes.
- NOTE(review): indentation and blank context lines were rebuilt from the hunk
  line counts; verify against the target file before applying.

diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 80bd696e2..5eec78a7e 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -15,7 +15,6 @@ from ..utils import (
     float_or_none,
     parse_iso8601,
     smuggle_url,
-    str_or_none,
     strip_jsonp,
     unified_timestamp,
     unsmuggle_url,
@@ -24,9 +23,41 @@ from ..utils import (
 
 
 class BiliBiliIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)(?:/?\?p=(?P<page>\d+))?'
 
     _TESTS = [{
+        'url': 'https://www.bilibili.com/video/av41213189?p=1',
+        'md5': '166c3e684970fbb4f834f24ddd19b275',
+        'info_dict': {
+            'id': '41213189_p1',
+            'cid': '72383807',
+            'ext': 'flv',
+            'title': '【春晚鬼畜】宋丹丹:我就是念诗女王!【改革春风吹进门】_p1',
+            'description': 'md5:a29fb90e0aff106d062a38658b0b75e2',
+            'duration': 152.024,
+            'timestamp': 1548014429,
+            'upload_date': '20190120',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'uploader': '吃素的狮子',
+            'uploader_id': '808171',
+        },
+    }, {
+        'url': 'https://www.bilibili.com/video/av41213189?p=2',
+        'md5': 'bda0939f327f2ead942e89d7f028ecc3',
+        'info_dict': {
+            'id': '41213189_p2',
+            'cid': '72387898',
+            'ext': 'flv',
+            'title': '【春晚鬼畜】宋丹丹:我就是念诗女王!【改革春风吹进门】_p2',
+            'description': 'md5:a29fb90e0aff106d062a38658b0b75e2',
+            'duration': 152.024,
+            'timestamp': 1548014429,
+            'upload_date': '20190120',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'uploader': '吃素的狮子',
+            'uploader_id': '808171',
+        },
+    }, {
         'url': 'http://www.bilibili.tv/video/av1074402/',
         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
@@ -111,10 +142,14 @@ class BiliBiliIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         anime_id = mobj.group('anime_id')
+        page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)
 
         if 'anime/' not in url:
             cid = self._search_regex(
+                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id) + r'\b', webpage, 'cid',
+                default=None
+            ) or self._search_regex(
                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
                 default=None
             ) or compat_parse_qs(self._search_regex(
@@ -194,7 +229,7 @@ class BiliBiliIE(InfoExtractor):
         title = self._html_search_regex(
             ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
              '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title')
+            group='title') + ('_p' + str(page_id) if page_id is not None else '')
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -204,7 +239,8 @@ class BiliBiliIE(InfoExtractor):
 
         # TODO 'view_count' requires deobfuscating Javascript
         info = {
-            'id': video_id,
+            'id': video_id if page_id is None else str(video_id) + '_p' + str(page_id),
+            'cid': cid,
             'title': title,
             'description': description,
             'timestamp': timestamp,
@@ -307,115 +343,3 @@ class BiliBiliBangumiIE(InfoExtractor):
         return self.playlist_result(
             entries, bangumi_id,
             season_info.get('bangumi_title'), season_info.get('evaluate'))
-
-
-class BilibiliAudioBaseIE(InfoExtractor):
-    def _call_api(self, path, sid, query=None):
-        if not query:
-            query = {'sid': sid}
-        return self._download_json(
-            'https://www.bilibili.com/audio/music-service-c/web/' + path,
-            sid, query=query)['data']
-
-
-class BilibiliAudioIE(BilibiliAudioBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
-    _TEST = {
-        'url': 'https://www.bilibili.com/audio/au1003142',
-        'md5': 'fec4987014ec94ef9e666d4d158ad03b',
-        'info_dict': {
-            'id': '1003142',
-            'ext': 'm4a',
-            'title': '【tsukimi】YELLOW / 神山羊',
-            'artist': 'tsukimi',
-            'comment_count': int,
-            'description': 'YELLOW的mp3版!',
-            'duration': 183,
-            'subtitles': {
-                'origin': [{
-                    'ext': 'lrc',
-                }],
-            },
-            'thumbnail': r're:^https?://.+\.jpg',
-            'timestamp': 1564836614,
-            'upload_date': '20190803',
-            'uploader': 'tsukimi-つきみぐー',
-            'view_count': int,
-        },
-    }
-
-    def _real_extract(self, url):
-        au_id = self._match_id(url)
-
-        play_data = self._call_api('url', au_id)
-        formats = [{
-            'url': play_data['cdns'][0],
-            'filesize': int_or_none(play_data.get('size')),
-        }]
-
-        song = self._call_api('song/info', au_id)
-        title = song['title']
-        statistic = song.get('statistic') or {}
-
-        subtitles = None
-        lyric = song.get('lyric')
-        if lyric:
-            subtitles = {
-                'origin': [{
-                    'url': lyric,
-                }]
-            }
-
-        return {
-            'id': au_id,
-            'title': title,
-            'formats': formats,
-            'artist': song.get('author'),
-            'comment_count': int_or_none(statistic.get('comment')),
-            'description': song.get('intro'),
-            'duration': int_or_none(song.get('duration')),
-            'subtitles': subtitles,
-            'thumbnail': song.get('cover'),
-            'timestamp': int_or_none(song.get('passtime')),
-            'uploader': song.get('uname'),
-            'view_count': int_or_none(statistic.get('play')),
-        }
-
-
-class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
-    _TEST = {
-        'url': 'https://www.bilibili.com/audio/am10624',
-        'info_dict': {
-            'id': '10624',
-            'title': '每日新曲推荐(每日11:00更新)',
-            'description': '每天11:00更新,为你推送最新音乐',
-        },
-        'playlist_count': 19,
-    }
-
-    def _real_extract(self, url):
-        am_id = self._match_id(url)
-
-        songs = self._call_api(
-            'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
-
-        entries = []
-        for song in songs:
-            sid = str_or_none(song.get('id'))
-            if not sid:
-                continue
-            entries.append(self.url_result(
-                'https://www.bilibili.com/audio/au' + sid,
-                BilibiliAudioIE.ie_key(), sid))
-
-        if entries:
-            album_data = self._call_api('menu/info', am_id) or {}
-            album_title = album_data.get('title')
-            if album_title:
-                for entry in entries:
-                    entry['album'] = album_title
-            return self.playlist_result(
-                entries, am_id, album_title, album_data.get('intro'))
-
-        return self.playlist_result(entries, am_id)