From 03585963608691fec71090878049a46ddac94c6b Mon Sep 17 00:00:00 2001
From: Parmjit Virk
Date: Mon, 5 Jun 2017 03:26:36 -0500
Subject: [PATCH] [Tubepatrol] Add new extractor

---
 youtube_dl/extractor/extractors.py |  1 +
 youtube_dl/extractor/tubepatrol.py | 92 ++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 youtube_dl/extractor/tubepatrol.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e1907314d..286124885 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1020,6 +1020,7 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutv import TruTVIE
 from .tube8 import Tube8IE
+from .tubepatrol import TubepatrolIE
 from .tubitv import TubiTvIE
 from .tumblr import TumblrIE
 from .tunein import (
diff --git a/youtube_dl/extractor/tubepatrol.py b/youtube_dl/extractor/tubepatrol.py
new file mode 100644
index 000000000..6055fb751
--- /dev/null
+++ b/youtube_dl/extractor/tubepatrol.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TubepatrolIE(InfoExtractor):
+    """Extract video metadata and the media URL for tubepatrol.sex pages."""
+
+    # e.g. http://tubepatrol.sex/to/767066/plump-asian-loves-fucking-and-sucking.html
+    _VALID_URL = r'https?://(?:www\.)?tubepatrol\.sex/[^/]+/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
+
+    _TESTS = [
+        {
+            # MPEG-4 video format
+            'url': 'http://tubepatrol.sex/to/555439/ani-black-fox-new-czech-anal-slut-legalporno-trailer.html',
+            'info_dict': {
+                'id': '555439',
+                'display_id': 'ani-black-fox-new-czech-anal-slut-legalporno-trailer',
+                'ext': 'mp4',
+                'title': 'Ani Black Fox New Czech Anal Slut [legalporno Trailer]',
+            },
+        },
+        {
+            # Flash video format
+            'url': 'http://tubepatrol.sex/to/3934608/ad4x-video-dp-de-kelly-lee-trailer-hd-porn-quebec.html',
+            'info_dict': {
+                'id': '3934608',
+                'display_id': 'ad4x-video-dp-de-kelly-lee-trailer-hd-porn-quebec',
+                'ext': 'flv',
+                'title': 'AD4X Video - DP De Kelly Lee Trailer HD - Porn Quebec',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict (id, display_id, title, url) for a video page.
+
+        Raises ExtractorError when the IDs, the title, the player data or the
+        media URL cannot be found.
+        """
+        # Both IDs are taken straight from the page URL.
+        mobj = re.match(self._VALID_URL, url)
+        try:
+            video_id = mobj.group('id')
+            display_id = mobj.group('display_id')
+        except (AttributeError, IndexError) as inst:
+            # AttributeError: no match (mobj is None); IndexError: unknown group.
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, 'The video or display ID could not be extracted: %s' % inst), expected=True)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Title: first try the generic header text.
+        # NOTE(review): the tag used here (<h1>) is an assumption; verify it
+        # against the markup of a live video page.
+        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', default=None)
+        if video_title is None:
+            # Fall back to the link text provided for embedding.
+            # NOTE(review): the anchor markup is an assumption as well; verify.
+            # The page URL is escaped so regex metacharacters match literally.
+            video_title = self._html_search_regex(r'<a[^>]+href="%s"[^>]*>(.+?)</a>' % re.escape(url), webpage, 'video_title', default=None)
+        if video_title is None:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, 'The video title could not be extracted'), expected=True)
+
+        # The URL of the video file is contained in a separate player page:
+        # https://borfos.com/kt_player/player.php?id=<video_id>
+        flashvars_webpage = self._download_webpage('https://borfos.com/kt_player/player.php?id=%s' % video_id, video_id)
+        flashvars_data = self._search_regex(r'(?s)flashvars\s*=\s*({.+?})', flashvars_webpage, 'flashvars_data', default=None)
+        if flashvars_data is None:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, 'The flash player data could not be extracted'), expected=True)
+
+        # A regex is used instead of _parse_json() on purpose: several
+        # extraneous flashvars fields contain characters that break JSON
+        # parsing, and none of those fields are needed here.
+        # Try the generic URL first, then fall back to the HTML5 one.
+        video_url = None
+        for key in ('video_url', 'video_html5_url'):
+            video_url = self._search_regex(r'(?s)%s:\s"(.+?)",\s*' % key, flashvars_data, 'video_url', default=None)
+            if video_url is not None:
+                break
+        if video_url is None:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, 'The video URL could not be extracted'), expected=True)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': video_title,
+            'url': video_url,
+        }