Add basic (non-concatenating) support for playlists

This commit is contained in:
qsniyg 2016-12-16 23:47:35 -08:00
parent 8f2b90df53
commit 7829a5c2db
2 changed files with 165 additions and 78 deletions

View File

@ -912,7 +912,10 @@ from .telewebion import TelewebionIE
from .testurl import TestURLIE
from .tf1 import TF1IE
from .tfo import TFOIE
from .tistory import TistoryIE
from .tistory import (
TistoryIE,
TistoryPlaylistIE
)
from .theintercept import TheInterceptIE
from .theplatform import (
ThePlatformIE,

View File

@ -10,6 +10,7 @@ from ..utils import (
)
from ..compat import (
compat_urllib_request,
compat_urllib_error,
compat_urlparse,
compat_str
)
@ -17,10 +18,149 @@ from ..compat import (
import os.path
import cgi
import re
import xml.etree.ElementTree as ET
class TistoryIE(InfoExtractor):
_VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)'
class TistoryBaseIE(InfoExtractor):
_TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)'
def _ti_unquote(self, url):
    """Return ``url`` with its percent-escapes decoded.

    Thin wrapper around ``compat_urlparse.unquote``.
    """
    decoded = compat_urlparse.unquote(url)
    return decoded
def _ti_get_title(self, url, response):
    """Derive a title from the file name advertised by ``response``.

    The file name is the ``filename`` parameter of the
    ``Content-Disposition`` header when that parameter is present, and the
    base name of ``url`` otherwise. It is percent-decoded and its
    extension is stripped.

    :param url: URL the response was obtained from.
    :param response: object whose ``info()`` returns the response headers.
    :return: the title as ``compat_str``.
    """
    _, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
    filename = params["filename"] if "filename" in params else url_basename(url)
    title = os.path.splitext(self._ti_unquote(filename))[0]
    # isinstance() instead of an exact type comparison, so that subclasses
    # of the text type are not sent through the byte-decoding branch.
    if not isinstance(title, compat_str):
        title = title.decode('UTF-8')
    return title
def _ti_get_ext(self, mime):
    """Translate a MIME type into a file extension.

    Delegates to ``mimetype2ext`` and rewrites a result of
    "x-shockwave-flash" to "flv".
    """
    guessed = mimetype2ext(mime)
    return "flv" if guessed == "x-shockwave-flash" else guessed
def _ti_get_real_from_check(self, check):
checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)(?:\.([A-Za-z0-9]*))?", check)
if not checkmatch:
return None
cfile = checkmatch.group(1)
url = checkmatch.group(2)
ext = None
if len(checkmatch.groups()) > 2:
ext = checkmatch.group(3)
return ("http://" + cfile + ".tistory.com/attach/" + url, ext)
def _ti_get_video_id(self, url):
if '_TI_MEDIA_URL_RE' not in self.__dict__:
self._TI_MEDIA_URL_RE = re.compile(self._TI_MEDIA_URL)
m = self._TI_MEDIA_URL_RE.match(url)
assert m
return m.group('id')
def _ti_get_headers(self, url, video_id):
    """Send a HEAD request for ``url`` and return the opened response.

    A progress line naming ``video_id`` is printed first.
    """
    self.to_screen('%s: Downloading headers' % (video_id))
    return compat_urllib_request.urlopen(HEADRequest(url))
def _ti_detect_swf(self, head):
content_type = head.info().get("content-type")
content_length = int(head.info().get("content-length"))
if content_type == "application/x-shockwave-flash" and content_length < 200000:
return True
return False
def _ti_get_media(self, url, video_id, head, ext=None, title=None):
    """Build the info dictionary for a direct media URL.

    When ``head`` is truthy, both ``ext`` and ``title`` are replaced by
    values derived from its headers; otherwise the passed-in values are
    used. An empty title falls back to ``video_id``.

    :return: dict with the keys ``id``, ``url``, ``title`` and ``ext``.
    """
    if head:
        ext = self._ti_get_ext(head.info().get("content-type"))
        title = self._ti_get_title(url, head)
    info = {"id": compat_str(video_id), "url": url}
    info["title"] = title if title else video_id
    info["ext"] = ext
    return info
def _ti_read_swf(self, url, video_id, head):
    """Download a SWF file and pull the real media URL out of it.

    The payload is downloaded, its three-byte signature is validated, the
    body is decompressed if needed, and the first embedded
    ``.../attachment/cfileN.uf@...`` URL is converted with
    ``_ti_get_real_from_check``.

    :param url: URL of the SWF file.
    :param video_id: ID used for progress output.
    :param head: unused; kept so existing callers keep working.
    :return: ``(real_url, ext)``.
    :raises ExtractorError: if the payload is not a SWF file or no usable
        URL is found in it.
    """
    data = self._request_webpage(url, video_id, "Downloading SWF").read()

    # bytearray yields integers on both Python 2 and Python 3, and the
    # slice tolerates payloads shorter than three bytes.
    signature = bytearray(data[:3])
    if (len(signature) < 3
            or signature[0] not in (0x43, 0x46, 0x5A)
            or signature[1] != 0x57
            or signature[2] != 0x53):
        raise ExtractorError("Not a SWF file")

    compression = signature[0]
    if compression == 0x46:
        # 'F': body is stored uncompressed after the 8-byte header.
        body = data[8:]
    elif compression == 0x43:
        # 'C': body after the 8-byte header is zlib-compressed.
        import zlib
        body = zlib.decompressobj().decompress(data[8:])
    else:
        # 'Z': LZMA-compressed body.
        # NOTE(review): the offset 11 is carried over unchanged; confirm
        # it matches what pylzma.decompress expects.
        import pylzma
        body = pylzma.decompress(data[11:])

    # Decode to text before searching: a str pattern cannot be applied to
    # bytes on Python 3. latin-1 maps every byte to exactly one character,
    # so decoding never fails and ASCII URLs are preserved.
    swfdata = body.decode('latin-1')

    match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
                      swfdata)
    if not match:
        raise ExtractorError("Unable to find check URL")

    # Validate before unpacking so a failed lookup raises ExtractorError
    # rather than a TypeError from unpacking None.
    resolved = self._ti_get_real_from_check(match.group(1))
    if not resolved or not resolved[0]:
        raise ExtractorError("Unable to find real URL in check URL")
    real_url, ext = resolved
    return (real_url, ext)
def _ti_dl(self, url, ext=None, title=None):
    """Resolve a Tistory media URL into an info dictionary.

    The headers of ``url`` are fetched first. If they describe a small
    SWF file, the real media URL is read out of that file and resolved in
    turn; otherwise the info dictionary is built for ``url`` itself.

    :param url: direct media URL (must match ``_TI_MEDIA_URL``).
    :param ext: extension to use when no headers are available.
    :param title: title to use when no headers are available.
    :return: the dictionary produced by ``_ti_get_media``.
    """
    video_id = self._ti_get_video_id(url)
    try:
        head = self._ti_get_headers(url, video_id)
    except Exception as err:
        # The header request is best-effort: carry on without headers,
        # but say so instead of swallowing the failure silently.
        self.to_screen('%s: Unable to download headers (%s)'
                       % (video_id, type(err).__name__))
        head = None
    if head and self._ti_detect_swf(head):
        real_url, real_ext = self._ti_read_swf(url, video_id, head)
        # Pass the caller's title along so it is not lost on the redirect.
        return self._ti_dl(real_url, real_ext, title)
    return self._ti_get_media(url, video_id, head, ext, title)
class TistoryIE(TistoryBaseIE):
_VALID_URL = TistoryBaseIE._TI_MEDIA_URL
_TESTS = [
{
@ -43,90 +183,34 @@ class TistoryIE(InfoExtractor):
}
]
def unquote(self, url):
    """Return ``url`` with its percent-escapes decoded.

    Thin wrapper around ``compat_urlparse.unquote``.
    """
    decoded = compat_urlparse.unquote(url)
    return decoded
def _real_extract(self, url):
    """Return the result of ``self._ti_dl`` for ``url``."""
    info = self._ti_dl(url)
    return info
def get_title(self, url, response):
    """Derive a title from the file name advertised by ``response``.

    The file name is the ``filename`` parameter of the
    ``Content-Disposition`` header when that parameter is present, and the
    base name of ``url`` otherwise. It is percent-decoded and its
    extension is stripped.

    :param url: URL the response was obtained from.
    :param response: object whose ``info()`` returns the response headers.
    :return: the title as ``compat_str``.
    """
    _, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
    filename = params["filename"] if "filename" in params else url_basename(url)
    title = os.path.splitext(self.unquote(filename))[0]
    # isinstance() instead of an exact type comparison, so that subclasses
    # of the text type are not sent through the byte-decoding branch.
    if not isinstance(title, compat_str):
        title = title.decode('UTF-8')
    return title
def get_ext(self, mime):
    """Translate a MIME type into a file extension.

    Delegates to ``mimetype2ext`` and rewrites a result of
    "x-shockwave-flash" to "flv".
    """
    guessed = mimetype2ext(mime)
    return "flv" if guessed == "x-shockwave-flash" else guessed
# Extractor whose _VALID_URL matches an XML file under
# cfs.tistory.com/custom/blog/.../skin/images/, optionally wrapped in a
# po.swf URL that carries it after "file=". The XML URL is captured as the
# "rurl" group and the file's base name as "id".
#
# NOTE(review): the statements below appear to interleave two different
# versions of _real_extract -- one that walks the downloaded XML and collects
# playlist entries, and one that resolves a single SWF/media URL through
# self.get_title / self.get_ext. Indentation and diff markers are missing in
# this view, so confirm against the repository before relying on the order
# or nesting of these lines.
class TistoryPlaylistIE(TistoryBaseIE):
_VALID_URL = r'(?:https?://cfs.tistory.com/custom/blog/.*/skin/images/po.swf?.*file=)?(?P<rurl>https?://cfs.tistory.com/custom/blog/.*/skin/images/(?P<id>.*)\.xml).*'
def _real_extract(self, url):
video_id = self._match_id(url)
# The XML location is the "rurl" group of the matched URL.
rurl = self._VALID_URL_RE.match(url).group("rurl")
self.to_screen('%s: Downloading headers' % (video_id))
req = HEADRequest(url)
xml = self._download_xml(rurl, video_id)
entries = []
head = compat_urllib_request.urlopen(req)
content_type = head.info().get("content-type")
content_length = int(head.info().get("content-length"))
# Three nested loops over the XML root's children, grandchildren and
# great-grandchildren; only elements whose tag contains "location" are used.
for tracklist in xml:
for track in tracklist:
for tag in track:
# NOTE(review): looks like leftover debugging output -- confirm and remove.
print(ET.tostring(tag))
if "location" not in tag.tag:
continue
ret = {
"id": compat_str(video_id),
"url": url,
"title": self.get_title(url, head)
}
loc = tag.text
if content_type == "application/x-shockwave-flash" and content_length < 200000:
swfreq = self._request_webpage(url, video_id, "Downloading SWF")
data = swfreq.read()
# NOTE(review): this unpacking assumes the helper always returns a 2-tuple,
# including when nothing is found -- verify against the helper.
newloc, ext = self._ti_get_real_from_check(loc)
if newloc:
loc = newloc
a = data[0]
b = data[1]
c = data[2]
entries.append(self._ti_dl(loc, ext))
if isinstance(a, str):
a = ord(a)
b = ord(b)
c = ord(c)
rawswfdata = data[8:]
if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
raise ExtractorError("Not a SWF file")
if a == 0x46:
swfdata = rawswfdata
elif a == 0x43:
import zlib
zip = zlib.decompressobj()
swfdata = str(zip.decompress(rawswfdata))
elif a == 0x5A:
import pylzma
rawswfdata = data[11:]
swfdata = str(pylzma.decompress(rawswfdata))
match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
swfdata)
if not match:
raise ExtractorError("Unable to find check URL")
checkurl = match.group(1)
checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl)
if not checkmatch:
raise ExtractorError("Unable to find real URL in check URL")
cfile = checkmatch.group(1)
url = checkmatch.group(2)
ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url
return self._real_extract(ret["url"])
else:
ret["ext"] = self.get_ext(content_type)
return ret
# The collected entries are wrapped with self.playlist_result.
return self.playlist_result(entries)