Add basic (non-concatenating) support for playlists

This commit is contained in:
qsniyg 2016-12-16 23:47:35 -08:00
parent 8f2b90df53
commit 7829a5c2db
2 changed files with 165 additions and 78 deletions

View File

@ -912,7 +912,10 @@ from .telewebion import TelewebionIE
from .testurl import TestURLIE from .testurl import TestURLIE
from .tf1 import TF1IE from .tf1 import TF1IE
from .tfo import TFOIE from .tfo import TFOIE
from .tistory import TistoryIE from .tistory import (
TistoryIE,
TistoryPlaylistIE
)
from .theintercept import TheInterceptIE from .theintercept import TheInterceptIE
from .theplatform import ( from .theplatform import (
ThePlatformIE, ThePlatformIE,

View File

@ -10,6 +10,7 @@ from ..utils import (
) )
from ..compat import ( from ..compat import (
compat_urllib_request, compat_urllib_request,
compat_urllib_error,
compat_urlparse, compat_urlparse,
compat_str compat_str
) )
@ -17,10 +18,149 @@ from ..compat import (
import os.path import os.path
import cgi import cgi
import re import re
import xml.etree.ElementTree as ET
class TistoryIE(InfoExtractor): class TistoryBaseIE(InfoExtractor):
_VALID_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)' _TI_MEDIA_URL = r'https?://cfile[0-9]*.uf.tistory.com/(?:media|attach|attachment|original)/(?P<id>[A-Za-z0-9]*)'
def _ti_unquote(self, url):
    """Return ``url`` after passing it through ``compat_urlparse.unquote``."""
    unquoted = compat_urlparse.unquote(url)
    return unquoted
def _ti_get_title(self, url, response):
    """Build a title from the served file name, without its extension.

    The name is taken from the ``filename`` parameter of the response's
    ``Content-Disposition`` header when that parameter exists; otherwise
    ``url_basename(url)`` is used. The name is unquoted before the
    extension is split off.
    """
    disposition = response.info().get('Content-Disposition', '')
    header_params = cgi.parse_header(disposition)[1]
    raw_name = header_params["filename"] if "filename" in header_params else url_basename(url)
    stem, _suffix = os.path.splitext(self._ti_unquote(raw_name))
    # Anything that is not already the text type is decoded as UTF-8.
    if type(stem) is not compat_str:
        stem = stem.decode('UTF-8')
    return stem
def _ti_get_ext(self, mime):
    """Translate a MIME type into a file extension using ``mimetype2ext``.

    A result of ``x-shockwave-flash`` is reported as ``flv`` instead.
    """
    guessed = mimetype2ext(mime)
    return "flv" if guessed == "x-shockwave-flash" else guessed
def _ti_get_real_from_check(self, check):
checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)(?:\.([A-Za-z0-9]*))?", check)
if not checkmatch:
return None
cfile = checkmatch.group(1)
url = checkmatch.group(2)
ext = None
if len(checkmatch.groups()) > 2:
ext = checkmatch.group(3)
return ("http://" + cfile + ".tistory.com/attach/" + url, ext)
def _ti_get_video_id(self, url):
if '_TI_MEDIA_URL_RE' not in self.__dict__:
self._TI_MEDIA_URL_RE = re.compile(self._TI_MEDIA_URL)
m = self._TI_MEDIA_URL_RE.match(url)
assert m
return m.group('id')
def _ti_get_headers(self, url, video_id):
    """Send a HEAD request for ``url`` and return the opened response.

    A progress line naming ``video_id`` is printed before the request.
    """
    self.to_screen('%s: Downloading headers' % (video_id))
    head_request = HEADRequest(url)
    response = compat_urllib_request.urlopen(head_request)
    return response
def _ti_detect_swf(self, head):
content_type = head.info().get("content-type")
content_length = int(head.info().get("content-length"))
if content_type == "application/x-shockwave-flash" and content_length < 200000:
return True
return False
def _ti_get_media(self, url, video_id, head, ext=None, title=None):
    """Assemble the info dict for a single media URL.

    When ``head`` is truthy, both the extension and the title are derived
    from it and replace the ``ext``/``title`` arguments. A title that is
    still empty afterwards falls back to ``video_id``.
    """
    if head:
        ext = self._ti_get_ext(head.info().get("content-type"))
        title = self._ti_get_title(url, head)
    return {
        "id": compat_str(video_id),
        "url": url,
        "title": title or video_id,
        "ext": ext
    }
def _ti_read_swf(self, url, video_id, head):
swfreq = self._request_webpage(url, video_id, "Downloading SWF")
data = swfreq.read()
a = data[0]
b = data[1]
c = data[2]
if isinstance(a, str):
a = ord(a)
b = ord(b)
c = ord(c)
rawswfdata = data[8:]
if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
raise ExtractorError("Not a SWF file")
if a == 0x46:
swfdata = rawswfdata
elif a == 0x43:
import zlib
zip = zlib.decompressobj()
swfdata = str(zip.decompress(rawswfdata))
elif a == 0x5A:
import pylzma
rawswfdata = data[11:]
swfdata = str(pylzma.decompress(rawswfdata))
match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
swfdata)
if not match:
raise ExtractorError("Unable to find check URL")
checkurl = match.group(1)
real_url, ext = self._ti_get_real_from_check(checkurl)
if not real_url:
raise ExtractorError("Unable to find real URL in check URL")
return (real_url, ext)
def _ti_dl(self, url, ext=None, title=None):
video_id = self._ti_get_video_id(url)
head = None
try:
head = self._ti_get_headers(url, video_id)
except compat_urllib_error.HTTPError:
pass
except Exception:
head = None
if head and self._ti_detect_swf(head):
return self._ti_dl(*self._ti_read_swf(url, video_id, head))
else:
return self._ti_get_media(url, video_id, head, ext, title)
class TistoryIE(TistoryBaseIE):
_VALID_URL = TistoryBaseIE._TI_MEDIA_URL
_TESTS = [ _TESTS = [
{ {
@ -43,90 +183,34 @@ class TistoryIE(InfoExtractor):
} }
] ]
def unquote(self, url): def _real_extract(self, url):
return compat_urlparse.unquote(url) return self._ti_dl(url)
def get_title(self, url, response):
_, params = cgi.parse_header(response.info().get('Content-Disposition', ''))
if "filename" in params:
filename = params["filename"]
else:
filename = url_basename(url)
retval = os.path.splitext(self.unquote(filename))[0] class TistoryPlaylistIE(TistoryBaseIE):
_VALID_URL = r'(?:https?://cfs.tistory.com/custom/blog/.*/skin/images/po.swf?.*file=)?(?P<rurl>https?://cfs.tistory.com/custom/blog/.*/skin/images/(?P<id>.*)\.xml).*'
if type(retval) != compat_str:
retval = retval.decode('UTF-8')
return retval
def get_ext(self, mime):
ext = mimetype2ext(mime)
if ext == "x-shockwave-flash":
ext = "flv"
return ext
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
rurl = self._VALID_URL_RE.match(url).group("rurl")
self.to_screen('%s: Downloading headers' % (video_id)) xml = self._download_xml(rurl, video_id)
req = HEADRequest(url) entries = []
head = compat_urllib_request.urlopen(req) for tracklist in xml:
content_type = head.info().get("content-type") for track in tracklist:
content_length = int(head.info().get("content-length")) for tag in track:
print(ET.tostring(tag))
if "location" not in tag.tag:
continue
ret = { loc = tag.text
"id": compat_str(video_id),
"url": url,
"title": self.get_title(url, head)
}
if content_type == "application/x-shockwave-flash" and content_length < 200000: newloc, ext = self._ti_get_real_from_check(loc)
swfreq = self._request_webpage(url, video_id, "Downloading SWF") if newloc:
data = swfreq.read() loc = newloc
a = data[0] entries.append(self._ti_dl(loc, ext))
b = data[1]
c = data[2]
if isinstance(a, str):
a = ord(a)
b = ord(b)
c = ord(c)
rawswfdata = data[8:] return self.playlist_result(entries)
if a not in [0x43, 0x46, 0x5A] or b != 0x57 or c != 0x53:
raise ExtractorError("Not a SWF file")
if a == 0x46:
swfdata = rawswfdata
elif a == 0x43:
import zlib
zip = zlib.decompressobj()
swfdata = str(zip.decompress(rawswfdata))
elif a == 0x5A:
import pylzma
rawswfdata = data[11:]
swfdata = str(pylzma.decompress(rawswfdata))
match = re.search("(https?://[A-Za-z0-9.]*/attachment/cfile[0-9]*.uf@[A-Za-z0-9.@%]*)",
swfdata)
if not match:
raise ExtractorError("Unable to find check URL")
checkurl = match.group(1)
checkmatch = re.search("(cfile[0-9]*.uf)@([A-Z0-9]*)", checkurl)
if not checkmatch:
raise ExtractorError("Unable to find real URL in check URL")
cfile = checkmatch.group(1)
url = checkmatch.group(2)
ret["url"] = "http://" + cfile + ".tistory.com/attach/" + url
return self._real_extract(ret["url"])
else:
ret["ext"] = self.get_ext(content_type)
return ret