From aaf7499008a68446a60556b98692459dabb57171 Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Sat, 17 Oct 2015 05:20:18 +0100
Subject: [PATCH 1/3] [naver] add support for other naver sites

---
 youtube_dl/extractor/naver.py | 178 +++++++++++++++++++++++-----------
 1 file changed, 120 insertions(+), 58 deletions(-)
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 1f5fc2145..128ecf192 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -7,23 +7,28 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urlparse,
+    compat_urllib_request,
 )
 from ..utils import (
     ExtractorError,
+    int_or_none,
+    float_or_none,
+    determine_ext,
 )
 
 
 class NaverIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:m\.)?(?:tvcast\.naver\.com/v/|news\.naver\.com/main/read\.nhn\?.*aid=|sports\.news\.naver\.com/(?:videoCenter|sports)/(?:index|video)\.nhn\?.*id=|movie\.naver\.com/movie/bi/mi/mediaView\.nhn\?.*mid=|music\.naver\.com/artist/videoPlayer\.nhn\?.*videoId=)(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://tvcast.naver.com/v/81652',
+        'md5': '0fe25e226a0ec388cd75679981bd2a1a',
         'info_dict': {
-            'id': '81652',
+            'id': '4B40C2B7F4BC7C7BBA5237C5E3CED1ADEAF5',
             'ext': 'mp4',
             'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
-            'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
-            'upload_date': '20130903',
+            'uploader_id': 'megastudy',
+            'uploader': '합격불변의 법칙 메가스터디',
         },
     }, {
         'url': 'http://tvcast.naver.com/v/395837',
@@ -36,63 +41,120 @@ class NaverIE(InfoExtractor):
             'upload_date': '20150519',
         },
         'skip': 'Georestricted',
+    }, {
+        'url': 'http://news.naver.com/main/read.nhn?mode=LSD&mid=tvh&oid=056&aid=0010235712&sid1=293',
+        'md5': 'ea02b7943173553618f49de08d6bd36e',
+        'info_dict': {
+            'id': '3C026B64F022C136ED9057AA820458275CD7',
+            'ext': 'mp4',
+            'title': '한미 정상 “북핵 문제 최우선”…공동성명 첫 채택',
+            'uploader_id': 'muploader_o',
+            'uploader': '',
+        },
+    }, {
+        'url': 'http://sports.news.naver.com/videoCenter/index.nhn?uCategory=esports&category=lol&id=158508',
+        'md5': '436254dbbb7dde42053c731039f6f14d',
+        'info_dict': {
+            'id': '3CD20A3B7B15044056189B09557FB349DE0B',
+            'ext': 'mp4',
+            'title': '\'전승 가도\' 절대강자 Faker의 인터뷰',
+            'uploader_id': 'muploader_n',
+            'uploader': '',
+        },
+    }, {
+        'url': 'http://movie.naver.com/movie/bi/mi/mediaView.nhn?code=115622&mid=27362',
+        'md5': 'ad9623ff5a23d6e9c0c7f451b063912f',
+        'info_dict': {
+            'id': '1132280440B9037B8E48DB0F569208440008',
+            'ext': 'mp4',
+            'title': '<인사이드 아웃> 메인 예고편',
+            'uploader_id': 'navermovie',
+            'uploader': '네이버 영화',
+        },
     }]
 
+    def _extract_video_formats(self, formats_list):
+        formats = []
+        for format_el in formats_list:
+            url = format_el.get('source')
+            if url:
+                encoding_option = format_el.get('encodingOption')
+                bitrate = format_el.get('bitrate')
+                formats.append({
+                    'format_id': encoding_option.get('id') or encoding_option.get('name'),
+                    'url': format_el['source'],
+                    'width': int_or_none(encoding_option.get('width')),
+                    'height': int_or_none(encoding_option.get('height')),
+                    'vbr': float_or_none(bitrate.get('video')),
+                    'abr': float_or_none(bitrate.get('audio')),
+                    'filesize': int_or_none(format_el.get('size')),
+                    'vcodec': format_el.get('type'),
+                    'ext': determine_ext(url, 'mp4'),
+                })
+        if formats:
+            self._sort_formats(formats)
+        return formats
+
+    def _extract_video_info(self, vid, key):
+        play_data = self._download_json(
+            'http://global.apis.naver.com/linetv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}),
+            vid, 'Downloading video info')
+        meta = play_data.get('meta')
+        user = meta.get('user')
+
+        thumbnails = []
+        for thumbnail in play_data['thumbnails']['list']:
+            thumbnails.append({'url': thumbnail['source']})
+
+        formats = self._extract_video_formats(play_data['videos']['list'])
+        if not formats:
+            video_info = self._download_json(
+                'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}),
+                vid, 'Downloading video info')
+            formats = self._extract_video_formats(video_info['videos']['list'])
+
+        return {
+            'id': vid,
+            'title': meta['subject'],
+            'formats': formats,
+            'thumbnail': meta.get('cover', {}).get('source'),
+            'thumbnails': thumbnails,
+            'view_count': int_or_none(meta.get('count')),
+            'uploader_id': user.get('id'),
+            'uploader': user.get('name'),
+        }
+
+    def _extract_id_and_key(self, webpage):
+        m_id = re.search(r'(?s)new\s+nhn.rmcnmv.RMCVideoPlayer\(\s*["\']([^"\']+)["\']\s*,\s*(?:{[^}]*?value[^:]*?:\s*?)?["\']([^"\']+)["\']', webpage)
+        if not m_id:
+            m_id = re.search(r'(?s)_sVid\s*=\s*["\']([^"\']+)["\'];\s*var\s+_sInkey\s*=\s*["\']([^"\']+)["\'];', webpage)
+        return m_id
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
-                         webpage)
-        if m_id is None:
-            error = self._html_search_regex(
-                r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
-                webpage, 'error', default=None)
-            if error:
-                raise ExtractorError(error, expected=True)
-            raise ExtractorError('couldn\'t extract vid and key')
-        vid = m_id.group(1)
-        key = m_id.group(2)
-        query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, })
-        query_urls = compat_urllib_parse.urlencode({
-            'masterVid': vid,
-            'protocol': 'p2p',
-            'inKey': key,
-        })
-        info = self._download_xml(
-            'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
-            video_id, 'Downloading video info')
-        urls = self._download_xml(
-            'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
-            video_id, 'Downloading video formats info')
-
-        formats = []
-        for format_el in urls.findall('EncodingOptions/EncodingOption'):
-            domain = format_el.find('Domain').text
-            uri = format_el.find('uri').text
-            f = {
-                'url': compat_urlparse.urljoin(domain, uri),
-                'ext': 'mp4',
-                'width': int(format_el.find('width').text),
-                'height': int(format_el.find('height').text),
-            }
-            if domain.startswith('rtmp'):
-                # urlparse does not support custom schemes
-                # https://bugs.python.org/issue18828
-                f.update({
-                    'url': domain + uri,
-                    'ext': 'flv',
-                    'rtmp_protocol': '1',  # rtmpt
-                })
-            formats.append(f)
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': info.find('Subject').text,
-            'formats': formats,
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'upload_date': info.find('WriteDate').text.replace('.', ''),
-            'view_count': int(info.find('PlayCount').text),
-        }
+        m_id = self._extract_id_and_key(webpage)
+        if not m_id:
+            iframe_urls = re.findall(r'<(?:iframe|IFRAME)[^>]+src="((?:/main/readVod|/movie/bi/mi/videoPlayer|http://serviceapi\.rmcnmv\.naver\.com/flash/outKeyPlayer)\.nhn[^"]+)"', webpage)
+            if iframe_urls:
+                entries = []
+                for iframe_url in iframe_urls:
+                    if iframe_url.startswith('/'):
+                        iframe_url = compat_urlparse.urljoin(url, iframe_url)
+                    request = compat_urllib_request.Request(iframe_url, headers={'Referer': url})
+                    iframe_webpage = self._download_webpage(request, video_id, 'Downloading iframe webpage')
+                    m_id = self._extract_id_and_key(iframe_webpage)
+                    if m_id:
+                        vid, key = m_id.groups()
+                        entries.append(self._extract_video_info(vid, key))
+                return entries[0] if len(entries) == 1 else self.playlist_result(entries)
+            else:
+                error = self._html_search_regex(
+                    r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+                    webpage, 'error', default=None)
+                if error:
+                    raise ExtractorError(error, expected=True)
+                raise ExtractorError('couldn\'t extract vid and key')
+        vid, key = m_id.groups()
+        return self._extract_video_info(vid, key)

From 0a9fad85278975d924f5f357e7425d36f1228df9 Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Sat, 17 Oct 2015 10:51:07 +0100
Subject: [PATCH 2/3] [naver] add test for music video and fix escaped iframe
 urls

---
 youtube_dl/extractor/naver.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 128ecf192..aef8f01f5 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -14,6 +14,7 @@ from ..utils import (
     int_or_none,
     float_or_none,
     determine_ext,
+    unescapeHTML,
 )
 
 
@@ -71,6 +72,16 @@ class NaverIE(InfoExtractor):
             'uploader_id': 'navermovie',
             'uploader': '네이버 영화',
         },
+    }, {
+        'url': 'http://music.naver.com/artist/videoPlayer.nhn?videoId=99476',
+        'md5': '4378409358f457bdce12e90f40ba33e2',
+        'info_dict': {
+            'id': 'E2651FBE1723D209C17AB611C296C57EA0A1',
+            'ext': 'mp4',
+            'title': '디아크 인사말',
+            'uploader_id': 'muploader_c',
+            'uploader': '',
+        },
     }]
 
     def _extract_video_formats(self, formats_list):
@@ -140,6 +151,7 @@ class NaverIE(InfoExtractor):
             if iframe_urls:
                 entries = []
                 for iframe_url in iframe_urls:
+                    iframe_url = unescapeHTML(iframe_url)
                     if iframe_url.startswith('/'):
                         iframe_url = compat_urlparse.urljoin(url, iframe_url)
                     request = compat_urllib_request.Request(iframe_url, headers={'Referer': url})

From a0b06b344a9b6b43af7bca9fa82b437c02d6f75d Mon Sep 17 00:00:00 2001
From: remitamine <remitamine@gmail.com>
Date: Sun, 1 Nov 2015 19:16:00 +0100
Subject: [PATCH 3/3] [naver] extract subtitles and hls formats and reuse code
 in VLiveIE

---
 youtube_dl/extractor/naver.py |  70 ++++++++++++++---------
 youtube_dl/extractor/vlive.py | 103 ++++++++++++++++------------------
 2 files changed, 92 insertions(+), 81 deletions(-)

diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index aef8f01f5..25d3ed06d 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -84,49 +84,53 @@ class NaverIE(InfoExtractor):
         },
     }]
 
-    def _extract_video_formats(self, formats_list):
+    def _extract_video_formats(self, formats_list, vid):
         formats = []
         for format_el in formats_list:
             url = format_el.get('source')
             if url:
-                encoding_option = format_el.get('encodingOption')
-                bitrate = format_el.get('bitrate')
-                formats.append({
-                    'format_id': encoding_option.get('id') or encoding_option.get('name'),
-                    'url': format_el['source'],
-                    'width': int_or_none(encoding_option.get('width')),
-                    'height': int_or_none(encoding_option.get('height')),
-                    'vbr': float_or_none(bitrate.get('video')),
-                    'abr': float_or_none(bitrate.get('audio')),
-                    'filesize': int_or_none(format_el.get('size')),
-                    'vcodec': format_el.get('type'),
-                    'ext': determine_ext(url, 'mp4'),
-                })
+                if format_el.get('type') == 'HLS':
+                    key = format_el.get('key')
+                    if key:
+                        url += '?%s=%s' % (key['name'], key['value'])
+                    formats.extend(self._extract_m3u8_formats(url, vid, 'mp4', m3u8_id='hls'))
+                else:
+                    encoding_option = format_el.get('encodingOption')
+                    bitrate = format_el.get('bitrate')
+                    formats.append({
+                        'format_id': encoding_option.get('id') or encoding_option.get('name'),
+                        'url': format_el['source'],
+                        'width': int_or_none(encoding_option.get('width')),
+                        'height': int_or_none(encoding_option.get('height')),
+                        'vbr': float_or_none(bitrate.get('video')),
+                        'abr': float_or_none(bitrate.get('audio')),
+                        'filesize': int_or_none(format_el.get('size')),
+                        'vcodec': format_el.get('type'),
+                        'ext': determine_ext(url, 'mp4'),
+                    })
         if formats:
             self._sort_formats(formats)
         return formats
 
-    def _extract_video_info(self, vid, key):
-        play_data = self._download_json(
-            'http://global.apis.naver.com/linetv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}),
-            vid, 'Downloading video info')
+    def _parse_video_info(self, play_data, vid):
         meta = play_data.get('meta')
-        user = meta.get('user')
+        user = meta.get('user', {})
 
         thumbnails = []
-        for thumbnail in play_data['thumbnails']['list']:
+        for thumbnail in play_data.get('thumbnails', {}).get('list', []):
             thumbnails.append({'url': thumbnail['source']})
 
-        formats = self._extract_video_formats(play_data['videos']['list'])
-        if not formats:
-            video_info = self._download_json(
-                'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}),
-                vid, 'Downloading video info')
-            formats = self._extract_video_formats(video_info['videos']['list'])
+        subtitles = {}
+        for caption in play_data.get('captions', {}).get('list', []):
+            source = caption['source']  # one subtitle track per caption language
+            subtitles[caption['language']] = [{'ext': determine_ext(source, default_ext='vtt'), 'url': source}]
+
+        formats = self._extract_video_formats(play_data['videos']['list'] + play_data.get('streams', []), vid)
 
         return {
             'id': vid,
-            'title': meta['subject'],
+            'title': meta.get('subject'),
             'formats': formats,
+            'subtitles': subtitles,
             'thumbnail': meta.get('cover', {}).get('source'),
             'thumbnails': thumbnails,
@@ -135,6 +139,18 @@ class NaverIE(InfoExtractor):
             'uploader': user.get('name'),
         }
 
+    def _extract_video_info(self, vid, key):
+        play_data = self._download_json(
+            'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?' + compat_urllib_parse.urlencode({'videoId': vid, 'key': key}),
+            vid, 'Downloading video info')
+        info = self._parse_video_info(play_data, vid)
+        if not info['formats']:
+            play_data = self._download_json(
+                'http://serviceapi.rmcnmv.naver.com/mobile/getVideoInfo.nhn?' + compat_urllib_parse.urlencode({'videoId': vid, 'inKey': key, 'protocol': 'http'}),
+                vid, 'Downloading video info')
+            info['formats'] = self._extract_video_formats(play_data['videos']['list'] + play_data.get('streams', []), vid)
+        return info
+
     def _extract_id_and_key(self, webpage):
         m_id = re.search(r'(?s)new\s+nhn.rmcnmv.RMCVideoPlayer\(\s*["\']([^"\']+)["\']\s*,\s*(?:{[^}]*?value[^:]*?:\s*?)?["\']([^"\']+)["\']', webpage)
         if not m_id:
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index 86c1cb5ef..8755cdc6a 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -6,19 +6,19 @@ from hashlib import sha1
 from base64 import b64encode
 from time import time
 
-from .common import InfoExtractor
+from .naver import NaverIE
 from ..utils import (
     ExtractorError,
-    determine_ext
+    int_or_none,
 )
 from ..compat import compat_urllib_parse
 
 
-class VLiveIE(InfoExtractor):
+class VLiveIE(NaverIE):
     IE_NAME = 'vlive'
     # www.vlive.tv/video/ links redirect to m.vlive.tv/video/ for mobile devices
     _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://m.vlive.tv/video/1326',
         'md5': 'cc7314812855ce56de70a06a27314983',
         'info_dict': {
@@ -27,60 +27,55 @@ class VLiveIE(InfoExtractor):
             'title': '[V] Girl\'s Day\'s Broadcast',
             'creator': 'Girl\'s Day',
         },
-    }
+    }]
     _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://m.vlive.tv/video/%s' % video_id,
-            video_id, note='Download video page')
+        status = self._download_json(
+            'http://www.vlive.tv/video/status?videoSeq=%s' % video_id,
+            video_id, note='Download status metadata')
 
-        title = self._og_search_title(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-        creator = self._html_search_regex(
-            r'<span[^>]+class="name">([^<>]+)</span>', webpage, 'creator')
-
-        url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id
-        msgpad = '%.0f' % (time() * 1000)
-        md = b64encode(
-            hmac.new(self._SECRET.encode('ascii'),
-                     (url[:255] + msgpad).encode('ascii'), sha1).digest()
-        )
-        url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md})
-        playinfo = self._download_json(url, video_id, 'Downloading video json')
-
-        if playinfo.get('message', '') != 'success':
-            raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful'))
-
-        if not playinfo.get('result'):
-            raise ExtractorError('No videos found.')
-
-        formats = []
-        for vid in playinfo['result'].get('videos', {}).get('list', []):
-            formats.append({
-                'url': vid['source'],
-                'ext': 'mp4',
-                'abr': vid.get('bitrate', {}).get('audio'),
-                'vbr': vid.get('bitrate', {}).get('video'),
-                'format_id': vid['encodingOption']['name'],
-                'height': vid.get('height'),
-                'width': vid.get('width'),
+        video_info = None  # stays None when nothing playable is found, so the final ExtractorError is raised
+        vid = status.get('vodId')
+        if vid:
+            key = status.get('vodInKey') or self._download_webpage(
+                'http://www.vlive.tv/video/inkey?vodId=%s' % vid, video_id)
+            if key:
+                video_info = self._extract_video_info(vid, key)
+        elif status['status'] not in ('CANCELED', 'COMING_SOON', 'NOT_FOUND'):
+            webpage = self._download_webpage(
+                'http://m.vlive.tv/video/%s' % video_id,
+                video_id, note='Download video page')
+            title = self._og_search_title(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            creator = self._html_search_regex(
+                r'<span[^>]+class="name">([^<>]+)</span>', webpage, 'creator')
+            url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id
+            msgpad = '%.0f' % (time() * 1000)
+            md = b64encode(
+                hmac.new(self._SECRET.encode('ascii'),
+                         (url[:255] + msgpad).encode('ascii'), sha1).digest()
+            )
+            url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md})
+            playinfo = self._download_json(url, video_id, 'Downloading video json')
+            if playinfo.get('message', '') != 'success':
+                raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful'))
+            result = playinfo.get('result')
+            if not result:
+                raise ExtractorError('No videos found.')
+            video_info = self._parse_video_info(result, video_id)
+            video_info.update({
+                'title': title,
+                'thumbnail': thumbnail,
+                'creator': creator,
             })
-        self._sort_formats(formats)
-
-        subtitles = {}
-        for caption in playinfo['result'].get('captions', {}).get('list', []):
-            subtitles[caption['language']] = [
-                {'ext': determine_ext(caption['source'], default_ext='vtt'),
-                 'url': caption['source']}]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'creator': creator,
-            'thumbnail': thumbnail,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        if video_info:
+            video_info.update({
+                'id': video_id,
+                'view_count': int_or_none(status.get('playCount')),
+                'likes': int_or_none(status.get('likeCount')),
+            })
+            return video_info
+        raise ExtractorError(status['status'])