Update twitcasting extractor and add twitcasting history extractor

2020-04-21 15:21:17 -07:00 · 2020-04-21 15:21:17 -07:00 · ea0a881157
commit ea0a881157
parent 049c0486bb
2 changed files with 92 additions and 11 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1214,7 +1214,7 @@ from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
 from .twentymin import TwentyMinutenIE
 from .twentythreevideo import TwentyThreeVideoIE
-from .twitcasting import TwitCastingIE
+from .twitcasting import TwitCastingIE, TwitCastingHistoryIE
 from .twitch import (
    TwitchVideoIE,
    TwitchChapterIE,
--- a/youtube_dl/extractor/twitcasting.py
+++ b/youtube_dl/extractor/twitcasting.py
@ -1,11 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 import itertools
 from .common import InfoExtractor
 from ..utils import urlencode_postdata
 import re
 class TwitCastingIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
@ -56,15 +57,25 @@ class TwitCastingIE(InfoExtractor):
            r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</',
            webpage, 'title', default=None) or self._html_search_meta(
            'twitter:title', webpage, fatal=True)
        # title is split across lines with lots of whitespace
        title = title.replace('\n', ' ')
        while '  ' in title:
            title = title.replace('  ', ' ')
-        m3u8_url = self._search_regex(
+        # m3u8_url = self._search_regex(
-            (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+        #     (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
-             r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'),
+        #      r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'),
-            webpage, 'm3u8 url', group='url')
+        #     webpage, 'm3u8 url', group='url')
-
+        # m3u8_url = m3u8_url.replace('\\/', '/')
-        formats = self._extract_m3u8_formats(
+        # formats = self._extract_m3u8_formats(
-            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+        #     m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
-            m3u8_id='hls')
+        #     m3u8_id='hls')
        formats = [
            {
                'url': "http://dl01.twitcasting.tv/{uploader_id}/download/{video_id}?dl=1".format(uploader_id=uploader_id, video_id=video_id),
                'ext': 'mp4',
            }
        ]
        thumbnail = self._og_search_thumbnail(webpage)
        description = self._og_search_description(
@ -79,3 +90,73 @@ class TwitCastingIE(InfoExtractor):
            'uploader_id': uploader_id,
            'formats': formats,
        }
 class TwitCastingHistoryIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/show'
    _TESTS = [
        {
            'url': 'https://twitcasting.tv/mttbernardini/show/',
            'info_dict': {
                'title': 'Matteo Bernardini',
                'id': 'mttbernardini',
            },
            'playlist_count': 1,
        },
    ]
    def _get_meta_and_entries(self, url):
        for page_num in itertools.count(0):
            page_url = "{}/{}".format(url.rstrip('/'), page_num)
            pagenum = None
            list_id = None
            webpage = self._download_webpage(
                page_url, list_id,
                'Downloading page %s' % pagenum)
            if page_num == 0:
                # title = re.search(r'<span class="tw-user-nav-name">(.*)</span>', webpage)
                title = re.search(r'(?s)<[^>]+class=["\']tw-user-nav-name[^>]+>(.+?)</', webpage)
                title = title.group(1).strip()
                user_id = re.search(r'data-user-id="(.*)"', webpage)
                user_id = user_id.group(1).strip()
                yield (title, user_id)
            first_page_selected = webpage.find('class="selected">1</a>') != -1
            if page_num != 0 and first_page_selected:
                break
            matches = re.finditer(r'''<a[^>]+class=["']tw-movie-thumbnail["'][^>]+href="(.+)"[^>]+>((?:\n|.)*?)</a>''', webpage)
            matches = list(matches)
            for match in matches:
                href = match.group(1)
                inner = match.group(2)
                # if REC isn't present either a live broadcast or an image
                # e.g. https://twitcasting.tv/marrynontan/movie/506296434
                if 'REC' not in inner:
                    continue
                # skip videos that require a password
                # e.g. https://twitcasting.tv/mttbernardini/movie/3689740
                locked = re.search(r'''src="/img/locked.png"''', inner)
                if locked is not None:
                    continue
                title = re.search(r'''<[^>]+class=["']tw-movie-thumbnail-title[^>]+>[ \n]*?(.+?) *?</''', inner)
                if title is not None:
                    title = title.group(1).strip()
                video_url = 'https://twitcasting.tv{}'.format(href)
                video_id = href.split('/')[-1]
                result = self.url_result(video_url, ie=TwitCastingIE.ie_key(), video_id=video_id, video_title=title)
                yield result
    def _real_extract(self, url):
        entries = self._get_meta_and_entries(url)
        (title, user_id) = next(entries)
        result = self.playlist_result(entries, playlist_title=title, playlist_id=user_id)
        return result