From 23c4fb5e3ea619e7da6adc9d3dee8170b544fe25 Mon Sep 17 00:00:00 2001 From: jhwgh1968 Date: Fri, 14 Jun 2019 21:22:27 -0500 Subject: [PATCH] [thisvid] Add extractor --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/thisvid.py | 132 +++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 youtube_dl/extractor/thisvid.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9cd7d3ac4..dabf1af84 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1162,6 +1162,10 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ( + ThisVidIE, + ThisVidEmbeddedIE +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, diff --git a/youtube_dl/extractor/thisvid.py b/youtube_dl/extractor/thisvid.py new file mode 100644 index 000000000..766f721f0 --- /dev/null +++ b/youtube_dl/extractor/thisvid.py @@ -0,0 +1,132 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from .openload import PhantomJSwrapper + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P[A-Za-z0-9-]+)' + + _TEST = { + 'url': 'https://thisvid.com/videos/madonna-show-in-sexy-underwear/', + 'md5': '48e38730d38394c6e9f1cce66fb04c6e', + 'info_dict': { + 'id': '829503', + 'display_id': 'madonna-show-in-sexy-underwear', + 'ext': 'mp4', + 'title': 'Madonna show in sexy underwear', + 'thumbnail': r're:^https?://.*preview\.mp4\.jpg$', + 'uploader_id': 'Mike_Hunt', + 'uploader_url': 'https://thisvid.com/members/584768', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + # The webpage contains a raw piece of javascript which creates a + # variable called flashvars used by the player to update the webpage. + # + # Running the javascript fixes the URL in the static HTML which is + # broken, and updates the flashvars variable with the new info. + # + # Because there are a ton of errors from PhantomJS that do not affect + # the output, they have to be split from the actual JSON. + jscode = """ + function checkFlashVars() { + flashvars = page.evaluate(function() { + return JSON.stringify(flashvars) + }); + console.log('---'); // ensure any errors appear above where we are + console.log(flashvars); + saveAndExit(); + } + checkFlashVars();""" + phantom = PhantomJSwrapper(self, required_version='2.0') + webpage, output = phantom.get(url, html=webpage, jscode=jscode) + flashvars = self._parse_json(output.split("---", 2)[1], display_id) + + # Get the video URL from the flashvars. + video_url = flashvars['video_url'] + + # The value in the static HTML starts with "function/0/http://..." + # where the zero is sometimes another number. + # + # At try that static URL if there was a static update failure. + if video_url.startswith('function'): + self.report_warning('Page JS failed, fetch will likely fail') + video_url = video_url.split("/", 3)[2] + + # Sometimes the video url ends with ".mp4", + # other times it ends with ".mp4/", + # yet other times it ends with ".mp4/?". + # + # All of it needs to be cleaned up. + video_url = video_url.split("?", 2)[0].strip("/") + + # Get the thumbnail URL from the flashvars. + thumbnail_url = flashvars['preview_url'] + + # The thumbnail usually does not have a protocol on the front, e.g. + # "//media.thisvid.com" + if thumbnail_url.startswith("//"): + thumbnail_url = 'https:' + thumbnail_url + + # The simplest way to get the real internal ID is to get it from the + # URL we will be accessing. + video_id = video_url.split("/")[-2] + + # Parse the title information. + title = self._search_regex(r'(?P<title>.+) -([a-zA-Z ]+ at)? ThisVid(\.com| tube)', + webpage, display_id, group='title') + + # Parse the author information from a profile link. + author_re = r']*>(?P[^<]+)[A-Za-z0-9-]+)' + _TEST = { + 'url': 'https://thisvid.com/embed/854312', + 'md5': '8166497c0281b54a48b179c997463892', + 'info_dict': { + 'id': '854312', + 'display_id': 'soles-of-jaxwheeler', + 'ext': 'mp4', + 'title': 'Soles of JaxWheeler', + 'thumbnail': r're:^https?://.*preview\.mp4\.jpg$', + 'uploader_id': 'SNK13', + 'uploader_url': 'https://thisvid.com/members/252887', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + embedded_page = self._download_webpage(url, video_id) + full_url = self._search_regex( + r'