[tistory] Add new extractor

2016-12-16 09:16:06 -08:00 · 2016-12-16 09:16:06 -08:00 · 491f42116c
commit 491f42116c
parent dc1f3a9f20
2 changed files with 114 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -912,6 +912,7 @@ from .telewebion import TelewebionIE
 from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .tfo import TFOIE
 from .tistory import TistoryIE
 from .theintercept import TheInterceptIE
 from .theplatform import (
    ThePlatformIE,
--- a/youtube_dl/extractor/tistory.py
+++ b/youtube_dl/extractor/tistory.py
@ -0,0 +1,113 @@
 # coding: utf-8
 from .common import InfoExtractor
 from ..utils import (
    url_basename,
    mimetype2ext,
    HEADRequest,
    ExtractorError
 )
 from ..compat import (
    compat_urllib_request,
    compat_urlparse,
    compat_str
 )
 import os.path
 import cgi
 import re
 class TistoryIE(InfoExtractor):
    _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment)/(?P<id>[A-Za-z0-9]*)'
    _TEST = {
        'url': 'http://cfile23.uf.tistory.com/media/111ED14A4FAEBC3C23AAE1',
        'md5': '55c32cda7b1a091d75c32aeaaea47595',
        'info_dict': {
            'id': '207B594C4FAEBBC118096B',
            'title': compat_str('함친.wmv-muxed', encoding="UTF-8"),
            'ext': 'mp4'
        }
    }
    def unquote(self, url):
        return compat_urlparse.unquote(url)
    def get_title(self, url, response):
        _, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
        if "filename" in params:
            filename = params["filename"]
        else:
            filename = url_basename(url)
        retval = os.path.splitext(self.unquote(filename))[0]
        if type(retval) != compat_str:
            retval = retval.decode('UTF-8')
        return retval
    def _real_extract(self, url):
        video_id = self._match_id(url)
        self.to_screen('%s: Downloading headers' % (video_id))
        req = HEADRequest(url)
        head = compat_urllib_request.urlopen(req)
        content_type = head.info().get("content-type")
        ret = {
            "id": compat_str(video_id),
            "url": url,
            "title": self.get_title(url, head)
        }
        if content_type == "application/x-shockwave-flash":
            swfreq = self._request_webpage(url, video_id, "Downloading SWF")
            data = swfreq.read()
            a = data[0]
            b = data[1]
            c = data[2]
            if isinstance(a, str):
                a = ord(a)
                b = ord(b)
                c = ord(c)
            rawswfdata = data[8:]
            if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
                raise ExtractorError("Not a SWF file")
            if a == 0x46:
                swfdata = rawswfdata
            elif a == 0x43:
                import zlib
                zip = zlib.decompressobj()
                swfdata = str(zip.decompress(rawswfdata))
            elif a == 0x5A:
                import pylzma
                rawswfdata = data[11:]
                swfdata = str(pylzma.decompress(rawswfdata))
            match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
                              swfdata)
            if not match:
                raise ExtractorError("Unable to find check URL")
            checkurl = match.group(1)
            checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl)
            if not checkmatch:
                raise ExtractorError("Unable to find real URL in check URL")
            cfile = checkmatch.group(1)
            url = checkmatch.group(2)
            ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url
            return self._real_extract(ret["url"])
        else:
            ret["ext"] = mimetype2ext(content_type)
            return ret