[Tubepatrol] Add new extractor

2017-06-05 03:26:36 -05:00 · 2017-06-05 03:26:36 -05:00 · 0358596360
commit 0358596360
parent 537191826f
2 changed files with 89 additions and 1 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1020,6 +1020,7 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutv import TruTVIE
 from .tube8 import Tube8IE
 from .tubepatrol import TubepatrolIE
 from .tubitv import TubiTvIE
 from .tumblr import TumblrIE
 from .tunein import (
--- a/youtube_dl/extractor/tubepatrol.py
+++ b/youtube_dl/extractor/tubepatrol.py
@ -0,0 +1,87 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from ..utils import ExtractorError
 from .common import InfoExtractor
 import re
 class TubepatrolIE(InfoExtractor):
    """Extractor for video pages on tubepatrol.sex.

    The numeric video ID and the display ID (URL slug) are read from the
    page URL, the title from the video page itself, and the media URL from
    the ``flashvars`` block of a separate player page on borfos.com.
    """

    # e.g. http://tubepatrol.sex/to/767066/plump-asian-loves-fucking-and-sucking.html
    # 'https?' accepts both schemes; the previous 'http?' made the final
    # 'p' optional and therefore never matched https:// URLs.
    _VALID_URL = r'https?://(?:www\.)?tubepatrol\.sex/[^/]+/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
    _TESTS = [
        {
            # MPEG-4 video format
            'url': 'http://tubepatrol.sex/to/555439/ani-black-fox-new-czech-anal-slut-legalporno-trailer.html',
            'info_dict': {
                'id': '555439',
                'display_id': 'ani-black-fox-new-czech-anal-slut-legalporno-trailer',
                'ext': 'mp4',
                'title': 'Ani Black Fox New Czech Anal Slut [legalporno Trailer]',
            },
        },
        {
            # Flash video format
            'url': 'http://tubepatrol.sex/to/3934608/ad4x-video-dp-de-kelly-lee-trailer-hd-porn-quebec.html',
            'info_dict': {
                'id': '3934608',
                'display_id': 'ad4x-video-dp-de-kelly-lee-trailer-hd-porn-quebec',
                'ext': 'flv',
                'title': 'AD4X Video - DP De Kelly Lee Trailer HD - Porn Quebec',
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict (id, display_id, title, url) for ``url``.

        Raises ExtractorError (expected=True) when the URL does not match
        ``_VALID_URL`` or when the title, the flash player data or the
        video URL cannot be found in the downloaded pages.
        """
        # IDs: both come from named groups of _VALID_URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Explicit guard instead of catching the AttributeError that
            # calling .group() on None would raise.
            raise ExtractorError('%s said: %s' % (self.IE_NAME, "The video or display ID could not be extracted: %s" % 'URL does not match the expected pattern'), expected=True)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')

        # get the webpage source code
        webpage = self._download_webpage(url, video_id)

        # Title: first try the generic header text. default=None is passed
        # so that a miss can be detected by the explicit check below.
        video_title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'video_title', default=None)
        if video_title is None:
            # Fall back to the link text provided for embedding. The URL is
            # escaped so that '.' and other regex metacharacters in it are
            # matched literally.
            video_title = self._html_search_regex(r'<a\shref="%s">(.+?)</a>' % re.escape(url), webpage, 'video_title', default=None)
        if video_title is None:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, "The video title could not be extracted"), expected=True)

        # the URL for the video file is contained in a separate page:
        # https://borfos.com/kt_player/player.php?id=<video_id>
        flashvars_webpage = self._download_webpage('https://borfos.com/kt_player/player.php?id=%s' % video_id, video_id)
        flashvars_data = self._search_regex(r'(?s)flashvars\s*=\s*({.+?})', flashvars_webpage, 'flashvars_data', default=None)
        if flashvars_data is None:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, "The flash player data could not be extracted"), expected=True)

        # URL
        # yes, we are going to use a regex to extract the video URL instead of using the JSON approach
        #
        # this is done because a bunch of extraneous fields in the flash data contain wonky characters
        # that screw up the call to _parse_json() and we do not care for these fields anyway, so ...
        #
        # Try the generic field first, then the HTML5 one; stop at the first hit.
        video_url = None
        for field in ('video_url', 'video_html5_url'):
            video_url = self._search_regex(r'(?s)%s:\s"(.+?)",\s*' % field, flashvars_data, 'video_url', default=None)
            if video_url is not None:
                break
        if video_url is None:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, "The video URL could not be extracted"), expected=True)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': video_title,
            'url': video_url,
        }