From 7829a5c2db38f444244c95a062ecb6f37b5b60b1 Mon Sep 17 00:00:00 2001
From: qsniyg
Date: Fri, 16 Dec 2016 23:47:35 -0800
Subject: [PATCH] Add basic (non-concatenating) support for playlists

---
Review notes (text between "---" and the diffstat is ignored by git-am):

* Restored the line structure of the patch and the regex group names
  (?P<id>...) and (?P<rurl>...), which had been stripped from the received
  copy.  The media-URL "id" group is read through m.group('id') and "rurl"
  through .group("rurl"); the playlist "id" group is inferred from its use
  of _match_id().  Indentation and blank context lines were rebuilt from
  the hunk line counts.
* _ti_get_real_from_check() returns (None, None) instead of None when
  nothing matches: both callers tuple-unpack the result, so a bare None
  raised TypeError before the intended error handling could run.
* _ti_get_video_id() raises ExtractorError instead of relying on assert.
* _ti_detect_swf() no longer raises TypeError when the HEAD response has
  no Content-Length header.
* TistoryPlaylistIE: dropped the debugging print() together with the
  ElementTree import that only it used; location tags without text are
  skipped.
* Hunk headers and the diffstat were recomputed for the edited content.
  The "index" blob ids are unchanged from the received copy, and its
  From: line carries no address.

 youtube_dl/extractor/extractors.py |   5 +-
 youtube_dl/extractor/tistory.py    | 245 +++++++++++++++++++----------
 2 files changed, 172 insertions(+), 78 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index c11c8bcdd..07b8be21a 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -912,7 +912,10 @@ from .telewebion import TelewebionIE
 from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .tfo import TFOIE
-from .tistory import TistoryIE
+from .tistory import (
+    TistoryIE,
+    TistoryPlaylistIE
+)
 from .theintercept import TheInterceptIE
 from .theplatform import (
     ThePlatformIE,
diff --git a/youtube_dl/extractor/tistory.py b/youtube_dl/extractor/tistory.py
index 8dbb009cb..885f1b9cb 100644
--- a/youtube_dl/extractor/tistory.py
+++ b/youtube_dl/extractor/tistory.py
@@ -10,6 +10,7 @@ from ..utils import (
 )
 from ..compat import (
     compat_urllib_request,
+    compat_urllib_error,
     compat_urlparse,
     compat_str
 )
@@ -17,10 +18,157 @@ from ..compat import (
 import os.path
 import cgi
 import re
 
 
-class TistoryIE(InfoExtractor):
-    _VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)'
+class TistoryBaseIE(InfoExtractor):
+    _TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)'
+
+    def _ti_unquote(self, url):
+        return compat_urlparse.unquote(url)
+
+    def _ti_get_title(self, url, response):
+        _, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
+        if "filename" in params:
+            filename = params["filename"]
+        else:
+            filename = url_basename(url)
+
+        retval = os.path.splitext(self._ti_unquote(filename))[0]
+
+        if type(retval) != compat_str:
+            retval = retval.decode('UTF-8')
+
+        return retval
+
+    def _ti_get_ext(self, mime):
+        ext = mimetype2ext(mime)
+        if ext == "x-shockwave-flash":
+            ext = "flv"
+        return ext
+
+    def _ti_get_real_from_check(self, check):
+        # Returns (attach URL, ext or None), or (None, None) when the
+        # check string has no "cfileN.uf@ID" part; callers unpack it.
+        checkmatch = re.search(r"(cfile[0-9]*.uf)@([A-Z0-9]*)(?:\.([A-Za-z0-9]*))?", check)
+        if not checkmatch:
+            return (None, None)
+
+        cfile = checkmatch.group(1)
+        url = checkmatch.group(2)
+        ext = None
+
+        if len(checkmatch.groups()) > 2:
+            ext = checkmatch.group(3)
+
+        return ("http://" + cfile + ".tistory.com/attach/" + url, ext)
+
+    def _ti_get_video_id(self, url):
+        if '_TI_MEDIA_URL_RE' not in self.__dict__:
+            self._TI_MEDIA_URL_RE = re.compile(self._TI_MEDIA_URL)
+        m = self._TI_MEDIA_URL_RE.match(url)
+        if not m:
+            raise ExtractorError("Unsupported media URL: %s" % url)
+        return m.group('id')
+
+    def _ti_get_headers(self, url, video_id):
+        self.to_screen('%s: Downloading headers' % (video_id))
+        req = HEADRequest(url)
+
+        return compat_urllib_request.urlopen(req)
+
+    def _ti_detect_swf(self, head):
+        content_type = head.info().get("content-type")
+        if content_type != "application/x-shockwave-flash":
+            return False
+
+        # Content-Length may be missing or malformed; int() would raise
+        try:
+            content_length = int(head.info().get("content-length"))
+        except (TypeError, ValueError):
+            return False
+
+        return content_length < 200000
+
+    def _ti_get_media(self, url, video_id, head, ext=None, title=None):
+        if head:
+            content_type = head.info().get("content-type")
+            ext = self._ti_get_ext(content_type)
+            title = self._ti_get_title(url, head)
+
+        if not title:
+            title = video_id
+
+        return {
+            "id": compat_str(video_id),
+            "url": url,
+            "title": title,
+            "ext": ext
+        }
+
+    def _ti_read_swf(self, url, video_id, head):
+        swfreq = self._request_webpage(url, video_id, "Downloading SWF")
+        data = swfreq.read()
+
+        a = data[0]
+        b = data[1]
+        c = data[2]
+
+        if isinstance(a, str):
+            a = ord(a)
+            b = ord(b)
+            c = ord(c)
+
+        rawswfdata = data[8:]
+
+        # Signature bytes: "FWS" (plain), "CWS" (zlib) or "ZWS" (LZMA)
+        if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
+            raise ExtractorError("Not a SWF file")
+
+        if a == 0x46:
+            swfdata = rawswfdata
+        elif a == 0x43:
+            import zlib
+            zip = zlib.decompressobj()
+            swfdata = str(zip.decompress(rawswfdata))
+        elif a == 0x5A:
+            import pylzma
+            rawswfdata = data[11:]
+            swfdata = str(pylzma.decompress(rawswfdata))
+
+        match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
+                          swfdata)
+        if not match:
+            raise ExtractorError("Unable to find check URL")
+
+        checkurl = match.group(1)
+
+        real_url, ext = self._ti_get_real_from_check(checkurl)
+        if not real_url:
+            raise ExtractorError("Unable to find real URL in check URL")
+
+        return (real_url, ext)
+
+    def _ti_dl(self, url, ext=None, title=None):
+        video_id = self._ti_get_video_id(url)
+
+        head = None
+
+        try:
+            head = self._ti_get_headers(url, video_id)
+        except compat_urllib_error.HTTPError:
+            pass
+        except Exception:
+            head = None
+
+        # A small SWF response embeds the real file URL; follow it
+        if head and self._ti_detect_swf(head):
+            return self._ti_dl(*self._ti_read_swf(url, video_id, head))
+        else:
+            return self._ti_get_media(url, video_id, head, ext, title)
+
+
+class TistoryIE(TistoryBaseIE):
+    _VALID_URL = TistoryBaseIE._TI_MEDIA_URL
 
     _TESTS = [
         {
@@ -43,90 +191,33 @@ class TistoryIE(InfoExtractor):
         }
     ]
 
-    def unquote(self, url):
-        return compat_urlparse.unquote(url)
+    def _real_extract(self, url):
+        return self._ti_dl(url)
 
-    def get_title(self, url, response):
-        _, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
-        if "filename" in params:
-            filename = params["filename"]
-        else:
-            filename = url_basename(url)
 
-        retval = os.path.splitext(self.unquote(filename))[0]
-
-        if type(retval) != compat_str:
-            retval = retval.decode('UTF-8')
-
-        return retval
-
-    def get_ext(self, mime):
-        ext = mimetype2ext(mime)
-        if ext == "x-shockwave-flash":
-            ext = "flv"
-        return ext
+class TistoryPlaylistIE(TistoryBaseIE):
+    _VALID_URL = r'(?:https?://cfs.tistory.com/custom/blog/.*/skin/images/po.swf?.*file=)?(?P<rurl>https?://cfs.tistory.com/custom/blog/.*/skin/images/(?P<id>.*)\.xml).*'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        rurl = self._VALID_URL_RE.match(url).group("rurl")
 
-        self.to_screen('%s: Downloading headers' % (video_id))
-        req = HEADRequest(url)
+        xml = self._download_xml(rurl, video_id)
+        entries = []
 
-        head = compat_urllib_request.urlopen(req)
-        content_type = head.info().get("content-type")
-        content_length = int(head.info().get("content-length"))
+        for tracklist in xml:
+            for track in tracklist:
+                for tag in track:
+                    if "location" not in tag.tag or not tag.text:
+                        continue
 
-        ret = {
-            "id": compat_str(video_id),
-            "url": url,
-            "title": self.get_title(url, head)
-        }
+                    loc = tag.text
 
-        if content_type == "application/x-shockwave-flash" and content_length < 200000:
-            swfreq = self._request_webpage(url, video_id, "Downloading SWF")
-            data = swfreq.read()
+                    newloc, ext = self._ti_get_real_from_check(loc)
+                    if newloc:
+                        loc = newloc
 
-            a = data[0]
-            b = data[1]
-            c = data[2]
+                    entries.append(self._ti_dl(loc, ext))
 
-            if isinstance(a, str):
-                a = ord(a)
-                b = ord(b)
-                c = ord(c)
 
-            rawswfdata = data[8:]
-
-            if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
-                raise ExtractorError("Not a SWF file")
-
-            if a == 0x46:
-                swfdata = rawswfdata
-            elif a == 0x43:
-                import zlib
-                zip = zlib.decompressobj()
-                swfdata = str(zip.decompress(rawswfdata))
-            elif a == 0x5A:
-                import pylzma
-                rawswfdata = data[11:]
-                swfdata = str(pylzma.decompress(rawswfdata))
-
-            match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
-                              swfdata)
-            if not match:
-                raise ExtractorError("Unable to find check URL")
-
-            checkurl = match.group(1)
-
-            checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl)
-            if not checkmatch:
-                raise ExtractorError("Unable to find real URL in check URL")
-
-            cfile = checkmatch.group(1)
-            url = checkmatch.group(2)
-
-            ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url
-            return self._real_extract(ret["url"])
-        else:
-            ret["ext"] = self.get_ext(content_type)
-            return ret
+        return self.playlist_result(entries)