From aaf7499008a68446a60556b98692459dabb57171 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 17 Oct 2015 05:20:18 +0100 Subject: [PATCH 1/3] [naver] add support for other naver sites --- youtube_dl/extractor/naver.py | 178 +++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 1f5fc2145..128ecf192 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -7,23 +7,28 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urlparse, + compat_urllib_request, ) from ..utils import ( ExtractorError, + int_or_none, + float_or_none, + determine_ext, ) class NaverIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P\d+)' + _VALID_URL = r'https?://(?:m\.)?(?:tvcast\.naver\.com/v/|news\.naver\.com/main/read\.nhn?.*aid=|sports\.news\.naver.com/(?:videoCenter|sports)/(?:index|video)\.nhn?.*id=|movie\.naver.com/movie/bi/mi/mediaView\.nhn?.*mid=|music\.naver\.com/artist/videoPlayer\.nhn?.*videoId=)(?P\d+)' _TESTS = [{ 'url': 'http://tvcast.naver.com/v/81652', + 'md5': '0fe25e226a0ec388cd75679981bd2a1a', 'info_dict': { - 'id': '81652', + 'id': '4B40C2B7F4BC7C7BBA5237C5E3CED1ADEAF5', 'ext': 'mp4', 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번', - 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', - 'upload_date': '20130903', + 'uploader_id': 'megastudy', + 'uploader': '합격불변의 법칙 메가스터디', }, }, { 'url': 'http://tvcast.naver.com/v/395837', @@ -36,63 +41,120 @@ class NaverIE(InfoExtractor): 'upload_date': '20150519', }, 'skip': 'Georestricted', + }, { + 'url': 'http://news.naver.com/main/read.nhn?mode=LSD&mid=tvh&oid=056&aid=0010235712&sid1=293', + 'md5': 'ea02b7943173553618f49de08d6bd36e', + 'info_dict': { + 'id': '3C026B64F022C136ED9057AA820458275CD7', + 'ext': 'mp4', + 'title': '한미 정상 “북핵 문제 최우선”…공동성명 첫 채택', + 'uploader_id': 'muploader_o', + 'uploader': '', + }, + }, { + 'url': 'http://sports.news.naver.com/videoCenter/index.nhn?uCategory=esports&category=lol&id=158508', + 'md5': '436254dbbb7dde42053c731039f6f14d', + 'info_dict': { + 'id': '3CD20A3B7B15044056189B09557FB349DE0B', + 'ext': 'mp4', + 'title': '\'전승 가도\' 절대강자 Faker의 인터뷰', + 'uploader_id': 'muploader_n', + 'uploader': '', + }, + }, { + 'url': 'http://movie.naver.com/movie/bi/mi/mediaView.nhn?code=115622&mid=27362', + 'md5': 'ad9623ff5a23d6e9c0c7f451b063912f', + 'info_dict': { + 'id': '1132280440B9037B8E48DB0F569208440008', + 'ext': 'mp4', + 'title': '<인사이드 아웃> 메인 예고편', + 'uploader_id': 'navermovie', + 'uploader': '네이버 영화', + }, }] + def _extract_video_formats(self, formats_list): + formats = [] + for format_el in formats_list: + url = format_el.get('source') + if url: + encoding_option = format_el.get('encodingOption') + bitrate = format_el.get('bitrate') + formats.append({ + 'format_id': encoding_option.get('id') or encoding_option.get('name'), + 'url': format_el['source'], + 'width': int_or_none(encoding_option.get('width')), + 'height': int_or_none(encoding_option.get('height')), + 'vbr': float_or_none(bitrate.get('video')), + 'abr': float_or_none(bitrate.get('audio')), + 'filesize': int_or_none(format_el.get('size')), + 'vcodec': format_el.get('type'), + 'ext': determine_ext(url, 'mp4'), + }) + if formats: + self._sort_formats(formats) + return formats + + def _extract_video_info(self, vid, key): + play_data = self._download_json( + 'http://global.apis.naver.com/linetv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}), + vid, 'Downloading video info') + meta = play_data.get('meta') + user = meta.get('user') + + thumbnails = [] + for thumbnail in play_data['thumbnails']['list']: + thumbnails.append({'url': thumbnail['source']}) + + formats = self._extract_video_formats(play_data['videos']['list']) + if not formats: + video_info = self._download_json( + 'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}), + vid, 'Downloading video info') + formats = self._extract_video_formats(video_info['videos']['list']) + + return { + 'id': vid, + 'title': meta['subject'], + 'formats': formats, + 'thumbnail': meta.get('cover', {}).get('source'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(meta.get('count')), + 'uploader_id': user.get('id'), + 'uploader': user.get('name'), + } + + def _extract_id_and_key(self, webpage): + m_id = re.search(r'(?s)new\s+nhn.rmcnmv.RMCVideoPlayer\(\s*["\']([^"\']+)["\']\s*,\s*(?:{[^}]*?value[^:]*?:\s*?)?["\']([^"\']+)["\']', webpage) + if not m_id: + m_id = re.search(r'(?s)_sVid\s*=\s*["\']([^"\']+)["\'];\s*var\s+_sInkey\s*=\s*["\']([^"\']+)["\'];', webpage) + return m_id + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', - webpage) - if m_id is None: - error = self._html_search_regex( - r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', - webpage, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) - raise ExtractorError('couldn\'t extract vid and key') - vid = m_id.group(1) - key = m_id.group(2) - query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, }) - query_urls = compat_urllib_parse.urlencode({ - 'masterVid': vid, - 'protocol': 'p2p', - 'inKey': key, - }) - info = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, - video_id, 'Downloading video info') - urls = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, - video_id, 'Downloading video formats info') - - formats = [] - for format_el in urls.findall('EncodingOptions/EncodingOption'): - domain = format_el.find('Domain').text - uri = format_el.find('uri').text - f = { - 'url': compat_urlparse.urljoin(domain, uri), - 'ext': 'mp4', - 'width': int(format_el.find('width').text), - 'height': int(format_el.find('height').text), - } - if domain.startswith('rtmp'): - # urlparse does not support custom schemes - # https://bugs.python.org/issue18828 - f.update({ - 'url': domain + uri, - 'ext': 'flv', - 'rtmp_protocol': '1', # rtmpt - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info.find('Subject').text, - 'formats': formats, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': info.find('WriteDate').text.replace('.', ''), - 'view_count': int(info.find('PlayCount').text), - } + m_id = self._extract_id_and_key(webpage) + if not m_id: + iframe_urls = re.findall(r'<(?:iframe|IFRAME)[^>]+src="((?:/main/readVod|/movie/bi/mi/videoPlayer|http://serviceapi\.rmcnmv\.naver\.com/flash/outKeyPlayer)\.nhn[^"]+)"', webpage) + if iframe_urls: + entries = [] + for iframe_url in iframe_urls: + if iframe_url.startswith('/'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + request = compat_urllib_request.Request(iframe_url, headers={'Referer': url}) + iframe_webpage = self._download_webpage(request, video_id, 'Downloading iframe webpage') + m_id = self._extract_id_and_key(iframe_webpage) + if m_id: + vid, key = m_id.groups() + entries.append(self._extract_video_info(vid, key)) + return entries[0] if len(entries) == 1 else self.playlist_result(entries) + else: + error = self._html_search_regex( + r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + raise ExtractorError('couldn\'t extract vid and key') + vid, key = m_id.groups() + return self._extract_video_info(vid, key) From 0a9fad85278975d924f5f357e7425d36f1228df9 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 17 Oct 2015 10:51:07 +0100 Subject: [PATCH 2/3] [naver] add test for music video and fix escaped iframe urls --- youtube_dl/extractor/naver.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 128ecf192..aef8f01f5 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -14,6 +14,7 @@ from ..utils import ( int_or_none, float_or_none, determine_ext, + unescapeHTML, ) @@ -71,6 +72,16 @@ class NaverIE(InfoExtractor): 'uploader_id': 'navermovie', 'uploader': '네이버 영화', }, + }, { + 'url': 'http://music.naver.com/artist/videoPlayer.nhn?videoId=99476', + 'md5': '4378409358f457bdce12e90f40ba33e2', + 'info_dict': { + 'id': 'E2651FBE1723D209C17AB611C296C57EA0A1', + 'ext': 'mp4', + 'title': '디아크 인사말', + 'uploader_id': 'muploader_c', + 'uploader': '', + }, }] def _extract_video_formats(self, formats_list): @@ -140,6 +151,7 @@ class NaverIE(InfoExtractor): if iframe_urls: entries = [] for iframe_url in iframe_urls: + iframe_url = unescapeHTML(iframe_url) if iframe_url.startswith('/'): iframe_url = compat_urlparse.urljoin(url, iframe_url) request = compat_urllib_request.Request(iframe_url, headers={'Referer': url}) From a0b06b344a9b6b43af7bca9fa82b437c02d6f75d Mon Sep 17 00:00:00 2001 From: remitamine Date: Sun, 1 Nov 2015 19:16:00 +0100 Subject: [PATCH 3/3] [naver] extract subtitles and hls formats and reuse code in VLiveIE --- youtube_dl/extractor/naver.py | 70 ++++++++++++++--------- youtube_dl/extractor/vlive.py | 103 ++++++++++++++++------------------ 2 files changed, 92 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index aef8f01f5..25d3ed06d 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -84,49 +84,53 @@ class NaverIE(InfoExtractor): }, }] - def _extract_video_formats(self, formats_list): + def _extract_video_formats(self, formats_list, vid): formats = [] for format_el in formats_list: url = format_el.get('source') if url: - encoding_option = format_el.get('encodingOption') - bitrate = format_el.get('bitrate') - formats.append({ - 'format_id': encoding_option.get('id') or encoding_option.get('name'), - 'url': format_el['source'], - 'width': int_or_none(encoding_option.get('width')), - 'height': int_or_none(encoding_option.get('height')), - 'vbr': float_or_none(bitrate.get('video')), - 'abr': float_or_none(bitrate.get('audio')), - 'filesize': int_or_none(format_el.get('size')), - 'vcodec': format_el.get('type'), - 'ext': determine_ext(url, 'mp4'), - }) + if format_el.get('type') == 'HLS': + key = format_el.get('key') + if key: + url += '?%s=%s' % (key['name'], key['value']) + formats.extend(self._extract_m3u8_formats(url, vid, 'mp4', m3u8_id='hls')) + else: + encoding_option = format_el.get('encodingOption') + bitrate = format_el.get('bitrate') + formats.append({ + 'format_id': encoding_option.get('id') or encoding_option.get('name'), + 'url': format_el['source'], + 'width': int_or_none(encoding_option.get('width')), + 'height': int_or_none(encoding_option.get('height')), + 'vbr': float_or_none(bitrate.get('video')), + 'abr': float_or_none(bitrate.get('audio')), + 'filesize': int_or_none(format_el.get('size')), + 'vcodec': format_el.get('type'), + 'ext': determine_ext(url, 'mp4'), + }) if formats: self._sort_formats(formats) return formats - def _extract_video_info(self, vid, key): - play_data = self._download_json( - 'http://global.apis.naver.com/linetv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}), - vid, 'Downloading video info') + def _parse_video_info(self, play_data, vid): meta = play_data.get('meta') - user = meta.get('user') + user = meta.get('user', {}) thumbnails = [] - for thumbnail in play_data['thumbnails']['list']: + for thumbnail in play_data.get('thumbnails', {}).get('list', []): thumbnails.append({'url': thumbnail['source']}) - formats = self._extract_video_formats(play_data['videos']['list']) - if not formats: - video_info = self._download_json( - 'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}), - vid, 'Downloading video info') - formats = self._extract_video_formats(video_info['videos']['list']) + subtitles = {} + for caption in play_data.get('captions', {}).get('list', []): + subtitles[caption['language']] = [ + {'ext': determine_ext(caption['source'], default_ext='vtt'), + 'url': caption['source']}] + + formats = self._extract_video_formats(play_data['videos']['list'] + play_data.get('streams', []), vid) return { 'id': vid, - 'title': meta['subject'], + 'title': meta.get('subject'), 'formats': formats, 'thumbnail': meta.get('cover', {}).get('source'), 'thumbnails': thumbnails, @@ -135,6 +139,18 @@ class NaverIE(InfoExtractor): 'uploader': user.get('name'), } + def _extract_video_info(self, vid, key): + play_data = self._download_json( + 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}), + vid, 'Downloading video info') + info = self._parse_video_info(play_data, vid) + if not info['formats']: + play_data = self._download_json( + 'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}), + vid, 'Downloading video info') + info['formats'] = self._extract_video_formats(play_data['videos']['list'] + play_data.get('streams', []), vid) + return info + def _extract_id_and_key(self, webpage): m_id = re.search(r'(?s)new\s+nhn.rmcnmv.RMCVideoPlayer\(\s*["\']([^"\']+)["\']\s*,\s*(?:{[^}]*?value[^:]*?:\s*?)?["\']([^"\']+)["\']', webpage) if not m_id: diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 86c1cb5ef..8755cdc6a 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -6,19 +6,19 @@ from hashlib import sha1 from base64 import b64encode from time import time -from .common import InfoExtractor +from .naver import NaverIE from ..utils import ( ExtractorError, - determine_ext + int_or_none, ) from ..compat import compat_urllib_parse -class VLiveIE(InfoExtractor): +class VLiveIE(NaverIE): IE_NAME = 'vlive' # www.vlive.tv/video/ links redirect to m.vlive.tv/video/ for mobile devices _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://m.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { @@ -27,60 +27,55 @@ class VLiveIE(InfoExtractor): 'title': '[V] Girl\'s Day\'s Broadcast', 'creator': 'Girl\'s Day', }, - } + }] _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://m.vlive.tv/video/%s' % video_id, - video_id, note='Download video page') + status = self._download_json( + 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, + video_id, note='Download status metadata') - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - creator = self._html_search_regex( - r']+class="name">([^<>]+)', webpage, 'creator') - - url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id - msgpad = '%.0f' % (time() * 1000) - md = b64encode( - hmac.new(self._SECRET.encode('ascii'), - (url[:255] + msgpad).encode('ascii'), sha1).digest() - ) - url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md}) - playinfo = self._download_json(url, video_id, 'Downloading video json') - - if playinfo.get('message', '') != 'success': - raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful')) - - if not playinfo.get('result'): - raise ExtractorError('No videos found.') - - formats = [] - for vid in playinfo['result'].get('videos', {}).get('list', []): - formats.append({ - 'url': vid['source'], - 'ext': 'mp4', - 'abr': vid.get('bitrate', {}).get('audio'), - 'vbr': vid.get('bitrate', {}).get('video'), - 'format_id': vid['encodingOption']['name'], - 'height': vid.get('height'), - 'width': vid.get('width'), + vid = status.get('vodId') + if vid: + key = status.get('vodInKey') + if not key: + key = self._download_webpage('http://www.vlive.tv/video/inkey?vodId=%s' % vid, video_id) + if key: + video_info = self._extract_video_info(vid, key) + elif status['status'] not in ('CANCELED', 'COMING_SOON', 'NOT_FOUND'): + webpage = self._download_webpage( + 'http://m.vlive.tv/video/%s' % video_id, + video_id, note='Download video page') + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + creator = self._html_search_regex( + r']+class="name">([^<>]+)', webpage, 'creator') + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id + msgpad = '%.0f' % (time() * 1000) + md = b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad).encode('ascii'), sha1).digest() + ) + url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md}) + playinfo = self._download_json(url, video_id, 'Downloading video json') + if playinfo.get('message', '') != 'success': + raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful')) + result = playinfo.get('result') + if not result: + raise ExtractorError('No videos found.') + video_info = self._parse_video_info(result, video_id) + video_info.update({ + 'title': title, + 'thumbnail': thumbnail, + 'creator': creator, }) - self._sort_formats(formats) - - subtitles = {} - for caption in playinfo['result'].get('captions', {}).get('list', []): - subtitles[caption['language']] = [ - {'ext': determine_ext(caption['source'], default_ext='vtt'), - 'url': caption['source']}] - - return { - 'id': video_id, - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - 'formats': formats, - 'subtitles': subtitles, - } + if video_info: + video_info.update({ + 'id': video_id, + 'view_count': int_or_none(status.get('playCount')), + 'likes': int_or_none(status.get('likeCount')), + }) + return video_info + raise ExtractorError(status['status'])