From 491f42116cb4516dbfb81db665e72c9d64a3e456 Mon Sep 17 00:00:00 2001 From: qsniyg Date: Fri, 16 Dec 2016 09:16:06 -0800 Subject: [PATCH] [tistory] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tistory.py | 113 +++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 youtube_dl/extractor/tistory.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bcf9f1906..c11c8bcdd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -912,6 +912,7 @@ from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE +from .tistory import TistoryIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py new file mode 100644 index 000000000..8299b6fdd --- /dev/null +++ b/youtube_dl/extractor/tistory.py @@ -0,0 +1,113 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + url_basename, + mimetype2ext, + HEADRequest, + ExtractorError +) +from ..compat import ( + compat_urllib_request, + compat_urlparse, + compat_str +) + +import os.path +import cgi +import re + + +class TistoryIE(InfoExtractor): + _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment)/(?P[A-Za-z0-9]*)' + + _TEST = { + 'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1', + 'md5': '55c32cda7b1a091d75c32aeaaea47595', + 'info_dict': { + 'id': '207B594C4FAEBBC118096B', + 'title': compat_str('함친.wmv-muxed', encoding="UTF-8"), + 'ext': 'mp4' + } + } + + def unquote(self, url): + return compat_urlparse.unquote(url) + + def get_title(self, url, response): + _, params = cgi.parse_header(response.info().get('Content-Disposition', '')) + if "filename" in params: + filename = params["filename"] + else: + filename = url_basename(url) + + retval = os.path.splitext(self.unquote(filename))[0] + + if type(retval) != compat_str: + retval = retval.decode('UTF-8') + + return retval + + def _real_extract(self, url): + video_id = self._match_id(url) + + self.to_screen('%s: Downloading headers' % (video_id)) + req = HEADRequest(url) + + head = compat_urllib_request.urlopen(req) + content_type = head.info().get("content-type") + + ret = { + "id": compat_str(video_id), + "url": url, + "title": self.get_title(url, head) + } + + if content_type == "application/x-shockwave-flash": + swfreq = self._request_webpage(url, video_id, "Downloading SWF") + data = swfreq.read() + + a = data[0] + b = data[1] + c = data[2] + + if isinstance(a, str): + a = ord(a) + b = ord(b) + c = ord(c) + + rawswfdata = data[8:] + + if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53: + raise ExtractorError("Not a SWF file") + + if a == 0x46: + swfdata = rawswfdata + elif a == 0x43: + import zlib + zip = zlib.decompressobj() + swfdata = str(zip.decompress(rawswfdata)) + elif a == 0x5A: + import pylzma + rawswfdata = data[11:] + swfdata = str(pylzma.decompress(rawswfdata)) + + match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)", + swfdata) + if not match: + raise ExtractorError("Unable to find check URL") + + checkurl = match.group(1) + + checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl) + if not checkmatch: + raise ExtractorError("Unable to find real URL in check URL") + + cfile = checkmatch.group(1) + url = checkmatch.group(2) + + ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url + return self._real_extract(ret["url"]) + else: + ret["ext"] = mimetype2ext(content_type) + return ret