Update twitcasting extractor and add twitcasting history extractor

This commit is contained in:
Yarn 2020-04-21 15:21:17 -07:00
parent 049c0486bb
commit ea0a881157
2 changed files with 92 additions and 11 deletions

View File

@ -1214,7 +1214,7 @@ from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE from .twentyfourvideo import TwentyFourVideoIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
from .twentythreevideo import TwentyThreeVideoIE from .twentythreevideo import TwentyThreeVideoIE
from .twitcasting import TwitCastingIE from .twitcasting import TwitCastingIE, TwitCastingHistoryIE
from .twitch import ( from .twitch import (
TwitchVideoIE, TwitchVideoIE,
TwitchChapterIE, TwitchChapterIE,

View File

@ -1,11 +1,12 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import urlencode_postdata from ..utils import urlencode_postdata
import re
class TwitCastingIE(InfoExtractor): class TwitCastingIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
@ -56,15 +57,25 @@ class TwitCastingIE(InfoExtractor):
r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</',
webpage, 'title', default=None) or self._html_search_meta( webpage, 'title', default=None) or self._html_search_meta(
'twitter:title', webpage, fatal=True) 'twitter:title', webpage, fatal=True)
# title is split across lines with lots of whitespace
title = title.replace('\n', ' ')
while ' ' in title:
title = title.replace(' ', ' ')
m3u8_url = self._search_regex( # m3u8_url = self._search_regex(
(r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', # (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), # r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'),
webpage, 'm3u8 url', group='url') # webpage, 'm3u8 url', group='url')
# m3u8_url = m3u8_url.replace('\\/', '/')
formats = self._extract_m3u8_formats( # formats = self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', # m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls') # m3u8_id='hls')
formats = [
{
'url': "http://dl01.twitcasting.tv/{uploader_id}/download/{video_id}?dl=1".format(uploader_id=uploader_id, video_id=video_id),
'ext': 'mp4',
}
]
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description( description = self._og_search_description(
@ -79,3 +90,73 @@ class TwitCastingIE(InfoExtractor):
'uploader_id': uploader_id, 'uploader_id': uploader_id,
'formats': formats, 'formats': formats,
} }
class TwitCastingHistoryIE(InfoExtractor):
    """Playlist extractor for a TwitCasting user's ``/show`` history pages.

    Walks the numbered history pages (``/show/0``, ``/show/1``, ...) and
    hands every recorded, non-locked movie over to ``TwitCastingIE``.
    """
    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/show'
    _TESTS = [
        {
            'url': 'https://twitcasting.tv/mttbernardini/show/',
            'info_dict': {
                'title': 'Matteo Bernardini',
                'id': 'mttbernardini',
            },
            'playlist_count': 1,
        },
    ]

    def _get_meta_and_entries(self, url):
        """Yield the playlist metadata, then one url_result per movie.

        The very first item produced is a ``(title, user_id)`` tuple taken
        from the first history page; every following item is a
        ``url_result`` dict pointing at a single movie page.

        :param url: a history URL accepted by ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj:
            # _VALID_URL ends at '/show', so group(0) drops any trailing
            # page number, query string or fragment present in the input.
            base_url = mobj.group(0)
            uploader_id = mobj.group('uploader_id')
        else:
            base_url = url.rstrip('/')
            uploader_id = None

        seen_urls = set()
        for page_num in itertools.count(0):
            webpage = self._download_webpage(
                '%s/%d' % (base_url, page_num), uploader_id,
                'Downloading page %d' % (page_num + 1))

            if page_num == 0:
                playlist_title = self._search_regex(
                    r'(?s)<[^>]+class=["\']tw-user-nav-name[^>]+>(.+?)</',
                    webpage, 'title').strip()
                user_id = self._search_regex(
                    r'data-user-id="([^"]+)"', webpage, 'user id',
                    default=uploader_id)
                if user_id:
                    user_id = user_id.strip()
                yield (playlist_title, user_id)
            elif 'class="selected">1</a>' in webpage:
                # The pager marks page "1" as selected although a later page
                # was requested - presumably the site falls back to the first
                # page once the index runs past the end.
                break

            found_new = False
            for entry in re.finditer(
                    r'''(?s)<a[^>]+class=["']tw-movie-thumbnail["'][^>]+href="([^"]+)"[^>]*>(.*?)</a>''',
                    webpage):
                href, inner = entry.group(1), entry.group(2)
                video_url = 'https://twitcasting.tv%s' % href
                if video_url in seen_urls:
                    continue
                seen_urls.add(video_url)
                found_new = True

                # if REC isn't present either a live broadcast or an image
                # e.g. https://twitcasting.tv/marrynontan/movie/506296434
                if 'REC' not in inner:
                    continue

                # skip videos that require a password
                # e.g. https://twitcasting.tv/mttbernardini/movie/3689740
                if 'src="/img/locked.png"' in inner:
                    continue

                title_mobj = re.search(
                    r'''(?s)<[^>]+class=["']tw-movie-thumbnail-title[^>]+>\s*(.+?)\s*</''',
                    inner)
                video_title = None
                if title_mobj is not None:
                    # collapse line breaks / runs of whitespace in the markup
                    video_title = re.sub(r'\s+', ' ', title_mobj.group(1)).strip()

                yield self.url_result(
                    video_url, ie=TwitCastingIE.ie_key(),
                    video_id=href.rstrip('/').split('/')[-1],
                    video_title=video_title)

            if not found_new:
                # Nothing unseen on this page: stop instead of requesting
                # pages forever when the pager marker above never shows up.
                break

    def _real_extract(self, url):
        """Build the playlist for the history page at ``url``."""
        entries = self._get_meta_and_entries(url)
        # The generator's first item is the metadata tuple; the rest of the
        # (still lazy) generator is used as the playlist entries.
        title, user_id = next(entries)
        return self.playlist_result(
            entries, playlist_title=title, playlist_id=user_id)