[Pivotshare] Add new extractor

2020-05-28 21:33:54 +10:00 · 2020-05-28 21:33:54 +10:00 · d5275d7ce6
commit d5275d7ce6
parent a54c5f83c0
2 changed files with 206 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -842,6 +842,7 @@ from .picarto import (
 )
 from .piksel import PikselIE
 from .pinkbike import PinkbikeIE
 from .pivotshare import PivotshareIE
 from .pladform import PladformIE
 from .platzi import (
    PlatziIE,
--- a/youtube_dl/extractor/pivotshare.py
+++ b/youtube_dl/extractor/pivotshare.py
@ -0,0 +1,205 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 import json
 from .common import InfoExtractor
 from ..utils import (
    try_get,
    compat_str,
    unified_strdate,
    unified_timestamp,
    determine_ext,
    ExtractorError,
    clean_html,
 )
 class PivotshareIE(InfoExtractor):
    _VALID_URL = r"""(?x)
                    https?://
                        (?:www\.)?
                        (?P<domain>
                            (?:
                                thunderboltpoweryogatv|
                                hungrymonkyoga|
                                soccer\.sklz|
                                womenstennisnetwork|
                                pigskinkids|
                                czwstudios|
                                highspotswrestlingnetwork|
                                titlematchwrestlingnetwork|
                                womenswrestlingnetwork|
                                rockstarpronetwork|
                                mcwragetv|
                                aawondemand|
                                pwnnetwork|
                                ondemand\.DiscoveryWrestling|
                                adsrcourses|
                                reaktortutorials|
                                crosscounter|
                                cultorama|
                                bongflix|
                                everyonecansalsa|
                                academy\.tedgibson|
                                video\.jasyoga|
                                (?P<subdomain>
                                    [^.]+
                                )\.pivotshare
                            )\.
                            (?:com|tv)
                        )?/media/
                        (?:
                            [^/]+
                        )/
                        (?P<id>
                            [0-9]+
                        )
                """
    _TESTS = [{
        'url': 'https://ted.pivotshare.com/media/rob-forbes-on-ways-of-seeing/61/feature',
        'md5': '30a2ba2b97d0a1ccd2efb5d534d922ae',
        'info_dict': {
            'id': '61',
            'ext': 'mp4',
            'title': 'Rob Forbes on ways of seeing',
            'description': 'md5:2dd273ce5f3e6fbb4c05d4be71db0174',
            'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
            'uploader': 'Rob Forbes',
            'uploader_id': '28',
            'release_date': '20100909',
            'timestamp': 1284054573,
            'upload_date': '20100909',
            'channel': 'TED',
            'channel_id': 3,
            'channel_url': 'https://ted.pivotshare.com',
            'duration': 934,
            'categories': ['Arts']
        }
    }, {
        'url': 'https://www.hungrymonkyoga.com/media/home/9057/feature',
        'md5': '6a931de856aaa1c0956314a510e07e78',
        'info_dict': {
            'id': '9057',
            'ext': 'mp4',
            'title': 'Home',
            'description': 'md5:c2af199b6f178943676b78262b22c654',
            'thumbnail': r're:^https?://.*\.(?:png|jpg)$',
            'uploader': 'Bo Wang',
            'uploader_id': '2750',
            'release_date': '20140514',
            'timestamp': 1400056615,
            'upload_date': '20140514',
            'channel': 'Hungrymonk%20Yoga%E2%84%A2',
            'channel_id': 1769,
            'channel_url': 'www.hungrymonkyoga.com',
            'duration': 179,
            'categories': ['Hungrymonk Yoga']
        }
    }, {
        'url': 'https://www.highspotswrestlingnetwork.com/media/pwg%3A-mystery-vortex-6/97499/feature',
        'only_matching': True,
    }, {
        'url': 'https://video.jasyoga.com/media/functional-core/89966/?collectionId=3353',
        'only_matching': True,
    }]
    _API_BASE = 'https://api.pivotshare.com/v1/'
    _CLIENT_ID = 'c0da629bb49ceff00327ac7c1f128bca'
    _TOKEN = None
    _NETRC_MACHINE = 'pivotshare'
    def _real_initialize(self):
        self._login()
    def _login(self):
        username, password = self._get_login_info()
        if username is None:
            return
        login = self._download_json(
            '%slogin' % self._API_BASE, None, 'Logging in',
            data=json.dumps({
                'username': username,
                'password': password
            }).encode(),
            headers={
                'Content-Type': 'application/json'
            },
            query={
                'client_id': self._CLIENT_ID
            })
        if login.get('errors'):
            raise ExtractorError('Unable to login: %s' % clean_html(login['errors']), expected=True)
        else:
            self._TOKEN = try_get(login, lambda x: x['login']['access_token'], compat_str)
    def _real_extract(self, url):
        domain, subdomain, video_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, video_id)
        query = {
            'client_id': self._CLIENT_ID,
            'search_method': 'subdomain' if subdomain else 'domain'
        }
        if self._TOKEN:
            query['access_token'] = self._TOKEN
        channel_meta = self._download_json(
            '%schannels/%s' % (self._API_BASE, subdomain if subdomain else domain),
            subdomain, "Downloading channel JSON metadata",
            query=query)
        query.pop('search_method')
        channel_id = try_get(channel_meta, lambda x: x['channel']['id'], int)
        channel = try_get(channel_meta, lambda x: x['channel']['name'], compat_str)
        channel_url = try_get(channel_meta, lambda x: x['channel']['domain'], compat_str)
        if not channel_url:
            channel_url = 'https://%s.pivotshare.com' % try_get(
                channel_meta, lambda x: x['channel']['subdomain'], compat_str)
        meta = self._download_json(
            '%schannels/%s/media/%s' % (self._API_BASE, channel_id, video_id),
            video_id, "Downloading media JSON metadata",
            query=query)
        try:
            stream_data = self._download_json(
                '%schannels/%s/media/%s/stream' % (self._API_BASE, channel_id, video_id),
                video_id, "Downloading stream JSON metadata",
                query=query)
        except ExtractorError as e:
            self.raise_login_required(
                'This video is only available for %s subscribers' % channel)
        sources = try_get(
            stream_data, lambda x: x['channel']['media']['stream']['formats'], list)
        formats = []
        if sources:
            for source in sources:
                if determine_ext(source.get('url')) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        source.get('url'), video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls',
                        fatal=False))
        self._sort_formats(formats)
        return {
            'id': video_id,
            'formats': formats,
            'title': try_get(meta, lambda x: x['channel']['media']['title'], compat_str) or self._og_search_title(webpage),
            'description': try_get(meta, lambda x: x['channel']['media']['description'], compat_str) or self._og_search_description(webpage),
            'thumbnail': try_get(meta, lambda x: x['channel']['media']['thumbnail_url']['large'], compat_str),
            'uploader': try_get(meta, lambda x: x['channel']['media']['author'], compat_str),
            'uploader_id': try_get(meta, lambda x: x['channel']['media']['author_id'], compat_str),
            'release_date': unified_strdate(try_get(meta, lambda x: x['channel']['media']['submit_date'], compat_str)),
            'timestamp': unified_timestamp(try_get(meta, lambda x: x['channel']['media']['submit_date'], compat_str)),
            'channel': channel,
            'channel_id': channel_id,
            'channel_url': channel_url,
            'duration': try_get(meta, lambda x: x['channel']['media']['duration'], int),
            'categories': [try_get(meta, lambda x: x['channel']['media']['category'], compat_str)],
        }