From 491f42116cb4516dbfb81db665e72c9d64a3e456 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 09:16:06 -0800 Subject: [PATCH 1/6] [tistory] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tistory.py | 113 +++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 youtube_dl/extractor/tistory.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bcf9f1906..c11c8bcdd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -912,6 +912,7 @@ from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE +from .tistory import TistoryIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py new file mode 100644 index 000000000..8299b6fdd --- /dev/null +++ b/youtube_dl/extractor/tistory.py @@ -0,0 +1,113 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + url_basename, + mimetype2ext, + HEADRequest, + ExtractorError +) +from ..compat import ( + compat_urllib_request, + compat_urlparse, + compat_str +) + +import os.path +import cgi +import re + + +class TistoryIE(InfoExtractor): + _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment)/(?P[A-Za-z0-9]*)' + + _TEST = { + 'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1', + 'md5': '55c32cda7b1a091d75c32aeaaea47595', + 'info_dict': { + 'id': '207B594C4FAEBBC118096B', + 'title': compat_str('함친.wmv-muxed', encoding="UTF-8"), + 'ext': 'mp4' + } + } + + def unquote(self, url): + return compat_urlparse.unquote(url) + + def get_title(self, url, response): + _, params = cgi.parse_header(response.info().get('Content-Disposition', '')) + if "filename" in params: + filename = params["filename"] + else: + filename = url_basename(url) + + retval = os.path.splitext(self.unquote(filename))[0] + + if type(retval) != compat_str: + retval = retval.decode('UTF-8') + + return retval + + def _real_extract(self, url): + video_id = self._match_id(url) + + self.to_screen('%s: Downloading headers' % (video_id)) + req = HEADRequest(url) + + head = compat_urllib_request.urlopen(req) + content_type = head.info().get("content-type") + + ret = { + "id": compat_str(video_id), + "url": url, + "title": self.get_title(url, head) + } + + if content_type == "application/x-shockwave-flash": + swfreq = self._request_webpage(url, video_id, "Downloading SWF") + data = swfreq.read() + + a = data[0] + b = data[1] + c = data[2] + + if isinstance(a, str): + a = ord(a) + b = ord(b) + c = ord(c) + + rawswfdata = data[8:] + + if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53: + raise ExtractorError("Not a SWF file") + + if a == 0x46: + swfdata = rawswfdata + elif a == 0x43: + import zlib + zip = zlib.decompressobj() + swfdata = str(zip.decompress(rawswfdata)) + elif a == 0x5A: + import pylzma + rawswfdata = data[11:] + swfdata = str(pylzma.decompress(rawswfdata)) + + match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)", + swfdata) + if not match: + raise ExtractorError("Unable to find check URL") + + checkurl = match.group(1) + + checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl) + if not checkmatch: + raise ExtractorError("Unable to find real URL in check URL") + + cfile = checkmatch.group(1) + url = checkmatch.group(2) + + ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url + return self._real_extract(ret["url"]) + else: + ret["ext"] = mimetype2ext(content_type) + return ret From 6083b83002a5f9af3e782af3c12eb17ab55982b9 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 09:38:33 -0800 Subject: [PATCH 2/6] [tistory] Add unicode literals --- youtube_dl/extractor/tistory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py index 8299b6fdd..77aaf4dce 100644 --- a/youtube_dl/extractor/tistory.py +++ b/youtube_dl/extractor/tistory.py @@ -1,4 +1,5 @@ # coding: utf-8 +from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( @@ -26,7 +27,7 @@ class TistoryIE(InfoExtractor): 'md5': '55c32cda7b1a091d75c32aeaaea47595', 'info_dict': { 'id': '207B594C4FAEBBC118096B', - 'title': compat_str('함친.wmv-muxed', encoding="UTF-8"), + 'title': '함친.wmv-muxed', 'ext': 'mp4' } } From 9093632cffb1bd0b92d815b90fc297e921dcdad6 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 10:38:02 -0800 Subject: [PATCH 3/6] [tistory] Add support for /original/ urls --- youtube_dl/extractor/tistory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py index 77aaf4dce..2d238b21c 100644 --- a/youtube_dl/extractor/tistory.py +++ b/youtube_dl/extractor/tistory.py @@ -20,7 +20,7 @@ import re class TistoryIE(InfoExtractor): - _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment)/(?P[A-Za-z0-9]*)' + _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' _TEST = { 'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1', From 8f2b90df537090821d7642500d05206312538f37 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 11:09:26 -0800 Subject: [PATCH 4/6] [tistory] Fix flvs --- youtube_dl/extractor/tistory.py | 38 ++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py index 2d238b21c..8dbb009cb 100644 --- a/youtube_dl/extractor/tistory.py +++ b/youtube_dl/extractor/tistory.py @@ -22,15 +22,26 @@ import re class TistoryIE(InfoExtractor): _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' - _TEST = { - 'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1', - 'md5': '55c32cda7b1a091d75c32aeaaea47595', - 'info_dict': { - 'id': '207B594C4FAEBBC118096B', - 'title': '함친.wmv-muxed', - 'ext': 'mp4' + _TESTS = [ + { + 'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1', + 'md5': '55c32cda7b1a091d75c32aeaaea47595', + 'info_dict': { + 'id': '207B594C4FAEBBC118096B', + 'title': '함친.wmv-muxed', + 'ext': 'mp4' + }, + }, + { + 'url': 'http://cfile24.uf.tistory.com/original/1870B0374FBD97A80980D2', + 'md5': 'dad089588a30447c0e51c78f29a9183e', + 'info_dict': { + 'id': '1870B0374FBD97A80980D2', + 'title': '무제-1', + 'ext': 'flv' + } } - } + ] def unquote(self, url): return compat_urlparse.unquote(url) @@ -49,6 +60,12 @@ class TistoryIE(InfoExtractor): return retval + def get_ext(self, mime): + ext = mimetype2ext(mime) + if ext == "x-shockwave-flash": + ext = "flv" + return ext + def _real_extract(self, url): video_id = self._match_id(url) @@ -57,6 +74,7 @@ class TistoryIE(InfoExtractor): head = compat_urllib_request.urlopen(req) content_type = head.info().get("content-type") + content_length = int(head.info().get("content-length")) ret = { "id": compat_str(video_id), @@ -64,7 +82,7 @@ class TistoryIE(InfoExtractor): "title": self.get_title(url, head) } - if content_type == "application/x-shockwave-flash": + if content_type == "application/x-shockwave-flash" and content_length < 200000: swfreq = self._request_webpage(url, video_id, "Downloading SWF") data = swfreq.read() @@ -110,5 +128,5 @@ class TistoryIE(InfoExtractor): ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url return self._real_extract(ret["url"]) else: - ret["ext"] = mimetype2ext(content_type) + ret["ext"] = self.get_ext(content_type) return ret From 7829a5c2db38f444244c95a062ecb6f37b5b60b1 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 23:47:35 -0800 Subject: [PATCH 5/6] Add basic (non-concatenating) support for playlists --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tistory.py | 238 +++++++++++++++++++---------- 2 files changed, 165 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c11c8bcdd..07b8be21a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -912,7 +912,10 @@ from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE -from .tistory import TistoryIE +from .tistory import ( + TistoryIE, + TistoryPlaylistIE +) from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py index 8dbb009cb..885f1b9cb 100644 --- a/youtube_dl/extractor/tistory.py +++ b/youtube_dl/extractor/tistory.py @@ -10,6 +10,7 @@ from ..utils import ( ) from ..compat import ( compat_urllib_request, + compat_urllib_error, compat_urlparse, compat_str ) @@ -17,10 +18,149 @@ from ..compat import ( import os.path import cgi import re +import xml.etree.ElementTree as ET -class TistoryIE(InfoExtractor): - _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' +class TistoryBaseIE(InfoExtractor): + _TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' + + def _ti_unquote(self, url): + return compat_urlparse.unquote(url) + + def _ti_get_title(self, url, response): + _, params = cgi.parse_header(response.info().get('Content-Disposition', '')) + if "filename" in params: + filename = params["filename"] + else: + filename = url_basename(url) + + retval = os.path.splitext(self._ti_unquote(filename))[0] + + if type(retval) != compat_str: + retval = retval.decode('UTF-8') + + return retval + + def _ti_get_ext(self, mime): + ext = mimetype2ext(mime) + if ext == "x-shockwave-flash": + ext = "flv" + return ext + + def _ti_get_real_from_check(self, check): + checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)(?:\.([A-Za-z0-9]*))?", check) + if not checkmatch: + return None + + cfile = checkmatch.group(1) + url = checkmatch.group(2) + ext = None + + if len(checkmatch.groups()) > 2: + ext = checkmatch.group(3) + + return ("http://" + cfile + ".tistory.com/attach/" + url, ext) + + def _ti_get_video_id(self, url): + if '_TI_MEDIA_URL_RE' not in self.__dict__: + self._TI_MEDIA_URL_RE = re.compile(self._TI_MEDIA_URL) + m = self._TI_MEDIA_URL_RE.match(url) + assert m + return m.group('id') + + def _ti_get_headers(self, url, video_id): + self.to_screen('%s: Downloading headers' % (video_id)) + req = HEADRequest(url) + + return compat_urllib_request.urlopen(req) + + def _ti_detect_swf(self, head): + content_type = head.info().get("content-type") + content_length = int(head.info().get("content-length")) + + if content_type == "application/x-shockwave-flash" and content_length < 200000: + return True + + return False + + def _ti_get_media(self, url, video_id, head, ext=None, title=None): + if head: + content_type = head.info().get("content-type") + ext = self._ti_get_ext(content_type) + title = self._ti_get_title(url, head) + + if not title: + title = video_id + + return { + "id": compat_str(video_id), + "url": url, + "title": title, + "ext": ext + } + + def _ti_read_swf(self, url, video_id, head): + swfreq = self._request_webpage(url, video_id, "Downloading SWF") + data = swfreq.read() + + a = data[0] + b = data[1] + c = data[2] + + if isinstance(a, str): + a = ord(a) + b = ord(b) + c = ord(c) + + rawswfdata = data[8:] + + if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53: + raise ExtractorError("Not a SWF file") + + if a == 0x46: + swfdata = rawswfdata + elif a == 0x43: + import zlib + zip = zlib.decompressobj() + swfdata = str(zip.decompress(rawswfdata)) + elif a == 0x5A: + import pylzma + rawswfdata = data[11:] + swfdata = str(pylzma.decompress(rawswfdata)) + + match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)", + swfdata) + if not match: + raise ExtractorError("Unable to find check URL") + + checkurl = match.group(1) + + real_url, ext = self._ti_get_real_from_check(checkurl) + if not real_url: + raise ExtractorError("Unable to find real URL in check URL") + + return (real_url, ext) + + def _ti_dl(self, url, ext=None, title=None): + video_id = self._ti_get_video_id(url) + + head = None + + try: + head = self._ti_get_headers(url, video_id) + except compat_urllib_error.HTTPError: + pass + except Exception: + head = None + + if head and self._ti_detect_swf(head): + return self._ti_dl(*self._ti_read_swf(url, video_id, head)) + else: + return self._ti_get_media(url, video_id, head, ext, title) + + +class TistoryIE(TistoryBaseIE): + _VALID_URL = TistoryBaseIE._TI_MEDIA_URL _TESTS = [ { @@ -43,90 +183,34 @@ class TistoryIE(InfoExtractor): } ] - def unquote(self, url): - return compat_urlparse.unquote(url) + def _real_extract(self, url): + return self._ti_dl(url) - def get_title(self, url, response): - _, params = cgi.parse_header(response.info().get('Content-Disposition', '')) - if "filename" in params: - filename = params["filename"] - else: - filename = url_basename(url) - retval = os.path.splitext(self.unquote(filename))[0] - - if type(retval) != compat_str: - retval = retval.decode('UTF-8') - - return retval - - def get_ext(self, mime): - ext = mimetype2ext(mime) - if ext == "x-shockwave-flash": - ext = "flv" - return ext +class TistoryPlaylistIE(TistoryBaseIE): + _VALID_URL = r'(?:https?://cfs.tistory.com/custom/blog/.*/skin/images/po.swf?.*file=)?(?Phttps?://cfs.tistory.com/custom/blog/.*/skin/images/(?P.*)\.xml).*' def _real_extract(self, url): video_id = self._match_id(url) + rurl = self._VALID_URL_RE.match(url).group("rurl") - self.to_screen('%s: Downloading headers' % (video_id)) - req = HEADRequest(url) + xml = self._download_xml(rurl, video_id) + entries = [] - head = compat_urllib_request.urlopen(req) - content_type = head.info().get("content-type") - content_length = int(head.info().get("content-length")) + for tracklist in xml: + for track in tracklist: + for tag in track: + print(ET.tostring(tag)) + if "location" not in tag.tag: + continue - ret = { - "id": compat_str(video_id), - "url": url, - "title": self.get_title(url, head) - } + loc = tag.text - if content_type == "application/x-shockwave-flash" and content_length < 200000: - swfreq = self._request_webpage(url, video_id, "Downloading SWF") - data = swfreq.read() + newloc, ext = self._ti_get_real_from_check(loc) + if newloc: + loc = newloc - a = data[0] - b = data[1] - c = data[2] + entries.append(self._ti_dl(loc, ext)) - if isinstance(a, str): - a = ord(a) - b = ord(b) - c = ord(c) - rawswfdata = data[8:] - - if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53: - raise ExtractorError("Not a SWF file") - - if a == 0x46: - swfdata = rawswfdata - elif a == 0x43: - import zlib - zip = zlib.decompressobj() - swfdata = str(zip.decompress(rawswfdata)) - elif a == 0x5A: - import pylzma - rawswfdata = data[11:] - swfdata = str(pylzma.decompress(rawswfdata)) - - match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)", - swfdata) - if not match: - raise ExtractorError("Unable to find check URL") - - checkurl = match.group(1) - - checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl) - if not checkmatch: - raise ExtractorError("Unable to find real URL in check URL") - - cfile = checkmatch.group(1) - url = checkmatch.group(2) - - ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url - return self._real_extract(ret["url"]) - else: - ret["ext"] = self.get_ext(content_type) - return ret + return self.playlist_result(entries) From dd6bda841fb275cbfd02cf96aa906393e989e7d5 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Sat, 17 Dec 2016 00:06:53 -0800 Subject: [PATCH 6/6] [tistory] Add support for daum --- youtube_dl/extractor/tistory.py | 74 +++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py index 885f1b9cb..669684b75 100644 --- a/youtube_dl/extractor/tistory.py +++ b/youtube_dl/extractor/tistory.py @@ -18,11 +18,10 @@ from ..compat import ( import os.path import cgi import re -import xml.etree.ElementTree as ET class TistoryBaseIE(InfoExtractor): - _TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' + _TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.(tistory.com|daum.net)/(?:media|attach|attachment|original)/(?P[A-Za-z0-9]*)' def _ti_unquote(self, url): return compat_urlparse.unquote(url) @@ -48,18 +47,19 @@ class TistoryBaseIE(InfoExtractor): return ext def _ti_get_real_from_check(self, check): - checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)(?:\.([A-Za-z0-9]*))?", check) + checkmatch = re.search("(?P(tistory.com|daum.net)).*(?Pcfile[0-9]*.uf)@(?P[A-Z0-9]*)(?:\.(?P[A-Za-z0-9]*))?", check) if not checkmatch: return None - cfile = checkmatch.group(1) - url = checkmatch.group(2) + host = checkmatch.group("host") + cfile = checkmatch.group("server") + url = checkmatch.group("id") ext = None - if len(checkmatch.groups()) > 2: - ext = checkmatch.group(3) + if len(checkmatch.groups()) > 3: + ext = checkmatch.group("ext") - return ("http://" + cfile + ".tistory.com/attach/" + url, ext) + return ("http://" + cfile + "." + host + "/attach/" + url, ext) def _ti_get_video_id(self, url): if '_TI_MEDIA_URL_RE' not in self.__dict__: @@ -83,6 +83,14 @@ class TistoryBaseIE(InfoExtractor): return False + def _ti_detect_xml(self, head): + content_type = head.info().get("content-type") + + if "xml" in content_type or content_type == "text/html": + return True + + return False + def _ti_get_media(self, url, video_id, head, ext=None, title=None): if head: content_type = head.info().get("content-type") @@ -141,8 +149,29 @@ class TistoryBaseIE(InfoExtractor): return (real_url, ext) - def _ti_dl(self, url, ext=None, title=None): - video_id = self._ti_get_video_id(url) + def _ti_read_xml(self, url, video_id): + xml = self._download_xml(url, video_id) + entries = [] + + for tracklist in xml: + for track in tracklist: + for tag in track: + if "location" not in tag.tag: + continue + + loc = tag.text + + newloc, ext = self._ti_get_real_from_check(loc) + if newloc: + loc = newloc + + entries.append(self._ti_dl(loc, ext)) + + return self.playlist_result(entries) + + def _ti_dl(self, url, ext=None, title=None, video_id=None): + if not video_id: + video_id = self._ti_get_video_id(url) head = None @@ -155,6 +184,8 @@ class TistoryBaseIE(InfoExtractor): if head and self._ti_detect_swf(head): return self._ti_dl(*self._ti_read_swf(url, video_id, head)) + elif head and self._ti_detect_xml(head): + return self._ti_read_xml(url, video_id) else: return self._ti_get_media(url, video_id, head, ext, title) @@ -188,29 +219,10 @@ class TistoryIE(TistoryBaseIE): class TistoryPlaylistIE(TistoryBaseIE): - _VALID_URL = r'(?:https?://cfs.tistory.com/custom/blog/.*/skin/images/po.swf?.*file=)?(?Phttps?://cfs.tistory.com/custom/blog/.*/skin/images/(?P.*)\.xml).*' + _VALID_URL = r'.*(?Phttps?://cfs.tistory.com/custom/blog/.*/skin/images/(?P.*)\.xml).*' def _real_extract(self, url): video_id = self._match_id(url) rurl = self._VALID_URL_RE.match(url).group("rurl") - xml = self._download_xml(rurl, video_id) - entries = [] - - for tracklist in xml: - for track in tracklist: - for tag in track: - print(ET.tostring(tag)) - if "location" not in tag.tag: - continue - - loc = tag.text - - newloc, ext = self._ti_get_real_from_check(loc) - if newloc: - loc = newloc - - entries.append(self._ti_dl(loc, ext)) - - - return self.playlist_result(entries) + return self._ti_dl(rurl, video_id=video_id)