l1ving_youtube-dl/youtube_dl/extractor/vidlii.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    get_element_by_id, str_or_none, get_element_by_class, strip_or_none,
    float_or_none)


class VidliiIE(InfoExtractor):
    """Information extractor for single video pages on vidlii.com.

    The watch page embeds a JavaScript ``videoInfo`` object (media source,
    thumbnail, duration) alongside regular HTML markup (title, description,
    uploader, counters); both are scraped with regular expressions.
    """

    # The scheme stays optional so that previously accepted scheme-less URLs
    # keep matching; the dot in the host name is escaped so that it no longer
    # matches arbitrary characters.
    _VALID_URL = r'(?:https?://)?(?:www\.)?vidlii\.com/watch\?v=(?P<id>[^?\s]{11})'
    _TEST = {
        'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',
        'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',
        'info_dict': {
            'id': 'tJluaH4BJ3v',
            'title': 'Vidlii is against me',
            'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the contest and no ne of my videos show up so maybe it is broken for everyone else but this one was trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is fixed PS: Jan you are cool please add my video',
            'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',
            'uploader': 'APPle5auc31995',
            'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',
            'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',
            'upload_date': '20171107',
            'categories': ['News & Politics'],
            'tags': ['Vidlii', 'Jan', 'Videogames'],
            'duration': 212,
            # Counters and ratings change over time, so only their type is
            # checked instead of pinning a value that will go stale.
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'type': 'video',
            'ext': 'mp4',
        },
    }

    def _real_extract(self, url):
        """Download the watch page for ``url`` and build the info dict.

        Mandatory fields (``title`` and the media ``url``) are looked up
        without a default on the last attempt, so the base-class search
        helpers report a missing field instead of silently returning ``None``.
        Every other field is optional and is ``None`` when it cannot be found.

        :param url: a watch-page URL matching ``_VALID_URL``.
        :return: dict with the extracted metadata and the direct media URL.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # --- mandatory fields -------------------------------------------
        # Try the visible heading first, then <title>, then the twitter
        # card; only the last attempt is fatal.
        title = str_or_none(
            self._html_search_regex(
                r'<h1>(.+?)</h1>', webpage, 'title', default=None)
        ) or str_or_none(
            self._html_search_regex(
                r'<title>([^<]+?)</title>', webpage, 'title', default=None)
        ) or str_or_none(
            self._html_search_meta(
                'twitter:title', webpage, 'title', fatal=True))

        # Named video_url so the ``url`` argument is not shadowed.
        video_url = self._html_search_regex(
            r'videoInfo[\s]*=[\s]*{[^}]*src:[\s]*(?:"|\')([^"]*?)(?:"|\')',
            webpage, 'url')

        # --- optional fields --------------------------------------------
        # get_element_by_id() yields None when the element is absent;
        # strip_or_none() passes that through instead of raising.
        description = strip_or_none(get_element_by_id('des_text', webpage))

        uploader = str_or_none(
            self._html_search_regex(
                r'<div[^>]+class="wt_person"[^>]*>(?:[^<]+)<a href="\/user\/[^>]*?>([^<]*?)<',
                webpage, 'uploader', default=None)
        ) or str_or_none(
            self._html_search_regex(
                r'<img src="[^>]+?class=["\']avt2\s*["\'][^>]+?alt=["\']([^"\']+?)["\']',
                webpage, 'uploader', default=None))
        # Only build the profile link when an uploader name was found.
        uploader_url = (
            'https://www.vidlii.com/user/%s' % uploader if uploader else None)

        # The page exposes the date as YYYY-MM-DD; the dashes are removed to
        # obtain YYYYMMDD.  The lookup uses default=None so a missing meta
        # tag falls through to the <date> element instead of crashing.
        upload_date = self._html_search_meta(
            'datePublished', webpage, 'upload_date', default=None
        ) or self._html_search_regex(
            r'<date>(.+?)</date>', webpage, 'upload_date', default=None)
        upload_date = upload_date.replace('-', '') if upload_date else None

        # The page shows a single category; it is wrapped in a list because
        # the info dict field is a list of categories.
        category = self._html_search_regex(
            r'<div>Category:\s*<\/div>[\s\r]*<div>[\s\r]*<a href="\/videos\?c=[^>]*>([^<]*?)<\/a>',
            webpage, 'categories', default=None)
        categories = [category] if category else None

        tags = [
            tag.strip()
            for tag in re.findall(
                r'<a href="/results\?q=[^>]*>[\s]*([^<]*)</a>', webpage)
            if tag.strip()
        ] or None

        duration = int_or_none(
            self._html_search_meta(
                'video:duration', webpage, 'duration', default=None)
        ) or int_or_none(
            self._html_search_regex(
                r'videoInfo[^=]*=[^{]*{[^}]*dur:([^,}]*?),', webpage,
                'duration', default=None))

        # Fallback view counter: first <strong> inside the "w_views"
        # element, tolerating the element being absent.
        views_html = get_element_by_class('w_views', webpage)
        views_match = re.search(
            r'<strong>([^<]*?)</strong>', views_html or '')
        view_count_fallback = views_match.group(1) if views_match else None
        view_count = int_or_none(
            self._html_search_regex(
                r'Views:[^<]*<strong>([^<]*?)<\/strong>', webpage,
                'view_count', default=None)
        ) or int_or_none(view_count_fallback)

        comment_count = int_or_none(
            self._html_search_regex(
                r'Comments:[^<]*<strong>([^<]*?)<\/strong>', webpage,
                'comment_count', default=None)
        ) or int_or_none(
            self._html_search_regex(
                r'<span[^>]+id="cmt_num"[^>]*>([^<]+?)<\/span>', webpage,
                'comment_count', default=None))

        # The rating is read from the rateYo widget initialisation call.
        average_rating = float_or_none(
            self._html_search_regex(
                r'{[\s\r]*\$\("#rateYo"\).rateYo\({[^}]*rating:\s*([^,]*?),[^}.]*}',
                webpage, 'average_rating', default=None))

        # The thumbnail is given as a site-relative path.
        thumbnail_link = self._html_search_regex(
            r'videoInfo[\s]*=[\s]*{[^}]*img:[\s]*(?:"|\')([^"]*?)(?:"|\')',
            webpage, 'thumbnail', default=None)
        thumbnail = (
            'https://www.vidlii.com%s' % thumbnail_link
            if thumbnail_link else None)

        # og:type is informational only, so its absence must not abort the
        # extraction.
        video_type = self._og_search_property(
            'type', webpage, 'type', default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'url': video_url,
            'uploader_url': uploader_url,
            'upload_date': upload_date,
            'categories': categories,
            'tags': tags,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'type': video_type,
        }
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`import re`

- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`from .common import InfoExtractor`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`from ..utils import (`
			`int_or_none,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`get_element_by_id, str_or_none, get_element_by_class, strip_or_none,`
			`float_or_none)`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00

			`class VidliiIE(InfoExtractor):`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`_VALID_URL = r'(?:https?:\/\/)(?:www\.)*vidlii.com\/watch\?v=(?P<id>[^?\s]{11})'`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`_TEST = {`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',`
			`'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`'info_dict': {`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'id': 'tJluaH4BJ3v',`
			`'title': 'Vidlii is against me',`
			`'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the contest and no ne of my videos show up so maybe it is broken for everyone else but this one was trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is fixed PS: Jan you are cool please add my video',`
			`'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',`
			`'uploader': 'APPle5auc31995',`
			`'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',`
			`'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',`
			`'upload_date': '20171107',`
			`'categories': 'News & Politics',`
			`'tags': ['Vidlii', 'Jan', 'Videogames'],`
			`'duration': 212,`
			`# TODO this might change in future, how to handle?`
			`'view_count': 230,`
			`# TODO this might change in future, how to handle?`
			`'comment_count': 13,`
			`'average_rating': 1.8571428571429,`
			`'type': 'video',`
			`'ext': 'mp4'`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`# * A value`
			`# * MD5 checksum; start the string with md5:`
			`# * A regular expression; start the string with re:`
			`# * Any Python type (for example int or float)`
			`}`
			`}`

			`def _real_extract(self, url):`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`# get required video properties`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`video_id = self._match_id(url)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`webpage = self._download_webpage(url, video_id)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
			`title = str_or_none(`
			`self._html_search_regex(r'<h1>(.+?)</h1>', webpage,`
			`'title', default=None)) or str_or_none(`
			`self._html_search_regex(r'<title>([^<]+?)</title>', webpage,`
			`'title', default=None)) or str_or_none(`
			`self._html_search_meta('twitter:title', webpage, 'title',`
			`default=False))`
			`description = strip_or_none(`
			`get_element_by_id('des_text', webpage).strip())`

			`uploader = str_or_none(`
			`self._html_search_regex(`
			`r'<div[^>]+class="wt_person"[^>]>(?:[^<]+)<a href="\/user\/[^>]?>([^<]*?)<',`
			`webpage,`
			`'uploader', default=None)) or str_or_none(`
			`self._html_search_regex(`
			`r'<img src="[^>]+?class=["\']avt2\s*["\'][^>]+?alt=["\']([^"\']+?)["\']',`
			`webpage, 'uploader', default=None))`

			`url = self._html_search_regex(`
			`r'videoInfo[\s]=[\s]{[^}]src:[\s](?:"\|\')([^"]*?)(?:"\|\')',`
			`webpage, 'url', default=None)`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
			`# get additional properties`
			`uploader_url = "https://www.vidlii.com/user/%s" % uploader`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
			`upload_date = str_or_none(`
			`self._html_search_meta('datePublished', webpage, 'upload_date',`
			`default=False).replace("-",`
			`"")) or str_or_none(`
			`self._html_search_regex(r'<date>(.+?)</date>', webpage,`
			`'upload_date', default="").replace("-",`
			`""))`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`categories = self._html_search_regex(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`r'<div>Category:\s<\/div>[\s\r]<div>[\s\r]<a href="\/videos\?c=[^>]>([^<]*?)<\/a>',`
			`webpage,`
			`'categories', default=None)`
			`tags = re.findall(r'<a href="/results\?q=[^>]>[\s]([^<]*)</a>',`
			`webpage) or None`
			`duration = int_or_none(`
			`self._html_search_meta('video:duration', webpage, 'duration',`
			`default=False)) or int_or_none(`
			`self._html_search_regex(`
			`r'videoInfo[^=]=[^{]{[^}]dur:([^,}]?),', webpage,`
			`'duration', default=None))`
			`view_count_fallback = re.findall(r'<strong>([^<]*?)</strong>',`
			`get_element_by_class("w_views",`
			`webpage))`
			`view_count_fallback = view_count_fallback[`
			`0] if view_count_fallback else None`
			`view_count = int_or_none(self._html_search_regex(`
			`r'Views:[^<]<strong>([^<]?)<\/strong>', webpage,`
			`'view_count', default=None)) or int_or_none(`
			`view_count_fallback)`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`comment_count = int_or_none(self._html_search_regex(`
			`r'Comments:[^<]<strong>([^<]?)<\/strong>', webpage,`
			`'comment_count', default=None)) or int_or_none(`
			`self._html_search_regex(`
			`r'<span[^>]+id="cmt_num"[^>]*>([^<]+?)<\/span>', webpage,`
			`'comment_count', default=None))`
			`average_rating = float_or_none(`
			`self._html_search_regex(`
			`r'{[\s\r]\$\("#rateYo"\).rateYo\({[^}]rating:\s([^,]?),[^}.]*}',`
			`webpage, 'average_rating', default=None))`
			`thumbnail_link = self._html_search_regex(`
			`r'videoInfo[\s]=[\s]{[^}]img:[\s](?:"\|\')([^"]*?)(?:"\|\')',`
			`webpage, 'thumbnail', default=None)`
			`thumbnail = 'https://www.vidlii.com%s' % thumbnail_link`
			`video_type = self._og_search_property('type', webpage, 'type')`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00
			`return {`
			`'id': video_id,`
			`'title': title,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'description': description,`
			`'uploader': uploader,`
			`'url': url,`
			`'uploader_url': uploader_url,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'upload_date': upload_date,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'categories': categories,`
			`'tags': tags,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'comment_count': comment_count,`
			`'average_rating': average_rating,`
			`'thumbnail': thumbnail,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'type': video_type`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`}`