l1ving_youtube-dl/youtube_dl/extractor/vidlii.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    get_element_by_id, str_or_none, get_element_by_class, strip_or_none,
    float_or_none)


class VidliiIE(InfoExtractor):
    """Extractor for single-video watch pages on vidlii.com.

    All metadata is scraped from the HTML of the watch page.  The title and
    the media URL are mandatory: extraction fails with an extractor error
    when neither of their sources can be found.  Every other field is
    optional and is reported as ``None`` when the page does not expose it.
    """

    # The scheme stays optional so that scheme-less URLs accepted by the
    # previous pattern keep matching; the dot in the host name is escaped so
    # that it no longer matches an arbitrary character.
    _VALID_URL = r'(?:https?://)?(?:www\.)?vidlii\.com/watch\?v=(?P<id>[^?\s]{11})'
    _TESTS = [{
        'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',
        'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',
        'info_dict': {
            'id': 'tJluaH4BJ3v',
            'title': 'Vidlii is against me',
            'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the contest and no ne of my videos show up so maybe it is broken for everyone else but this one was trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is fixed PS: Jan you are cool please add my video',
            'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',
            'uploader': 'APPle5auc31995',
            'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',
            'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',
            'upload_date': '20171107',
            'categories': ['News & Politics'],
            'tags': ['Vidlii', 'Jan', 'Videogames'],
            'duration': 212,
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'type': 'video',
            'ext': 'mp4',
        }
    }, {
        'url': 'https://www.vidlii.com/watch?v=vBo2IcrwOkO',
        'md5': 'b42640a596b4dc986702567d49268963',
        'info_dict': {
            'id': 'vBo2IcrwOkO',
            'title': '(OLD VIDEO) i like youtube!!',
            'description': 'Original upload date:<br />\nMarch 10th 2011<br />\nCredit goes to people who own content in the video',
            'thumbnail': 'https://www.vidlii.com/usfi/thmp/vBo2IcrwOkO.jpg',
            'uploader': 'MyEditedVideoSpartan',
            'url': 'https://cdn.vidlii.com/videos/vBo2IcrwOkO.mp4',
            'uploader_url': 'https://www.vidlii.com/user/MyEditedVideoSpartan',
            'upload_date': '20171011',
            'categories': ['Film & Animation'],
            'tags': None,
            'duration': 34,
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'type': 'video',
            'ext': 'mp4',
        }
    }]

    # Site root used to build absolute uploader and thumbnail URLs.
    _SITE_ROOT = 'https://www.vidlii.com'
    # Suffix removed from the <title> and twitter:title values.
    _TITLE_SUFFIX = ' - VidLii'

    @staticmethod
    def _first_not_none(*candidates):
        """Return the first candidate that is not ``None`` (or ``None``).

        Unlike an ``a or b`` chain this keeps falsy-but-valid values such as
        a count of ``0``.
        """
        for candidate in candidates:
            if candidate is not None:
                return candidate
        return None

    def _strip_site_suffix(self, text):
        """Remove the site suffix from *text*; ``None``/empty passes through."""
        if not text:
            return text
        return text.replace(self._TITLE_SUFFIX, '')

    def _real_extract(self, url):
        """Download the watch page for *url* and return its info dict.

        Raises an extractor error when the page cannot be downloaded, when
        no title source matches, or when the media URL is not found.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Title sources, in order of preference: page heading, <title> tag,
        # twitter:title meta tag.  Only the last lookup is fatal, so an
        # error is raised only when all three are missing.
        title = (
            self._html_search_regex(
                r'<h1>(.+?)</h1>', webpage, 'title', default=None)
            or self._strip_site_suffix(self._html_search_regex(
                r'<title>([^<]+?)</title>', webpage, 'title', default=None))
            or self._strip_site_suffix(self._html_search_meta(
                'twitter:title', webpage, 'title', fatal=True)))

        # strip_or_none tolerates a missing element (None in, None out).
        description = strip_or_none(get_element_by_id('des_text', webpage))

        uploader = (
            self._html_search_regex(
                r'<div[^>]+class="wt_person"[^>]*>(?:[^<]+)<a href="\/user\/[^>]*?>([^<]*?)<',
                webpage, 'uploader', default=None)
            or self._html_search_regex(
                r'<img src="[^>]+?class=["\']avt2\s*["\'][^>]+?alt=["\']([^"\']+?)["\']',
                webpage, 'uploader', default=None))
        # Only build the profile URL when an uploader name was found.
        uploader_url = (
            '%s/user/%s' % (self._SITE_ROOT, uploader) if uploader else None)

        # Mandatory field: no default, so a missing media URL raises a clear
        # extractor error instead of returning an unusable result.
        video_url = self._html_search_regex(
            r'videoInfo[\s]*=[\s]*{[^}]*src:[\s]*(?:"|\')([^"]*?)(?:"|\')',
            webpage, 'video url')

        # datePublished is YYYY-MM-DD for the pages in _TESTS; dropping the
        # dashes gives the YYYYMMDD form the tests expect.
        published = self._html_search_meta(
            'datePublished', webpage, 'upload_date', default=None)
        upload_date = published.replace('-', '') if published else None

        # The info dict expects a list of categories; the page shows one.
        category = self._html_search_regex(
            r'<div>Category:\s*<\/div>[\s\r]*<div>[\s\r]*<a href="\/videos\?c=[^>]*>([^<]*?)<\/a>',
            webpage, 'categories', default=None)
        categories = [category] if category else None

        tags = re.findall(
            r'<a href="/results\?q=[^>]*>[\s]*([^<]*)</a>', webpage) or None

        # A duration of 0 is not meaningful, so a plain `or` fallback is
        # intentional here.
        duration = (
            int_or_none(self._html_search_meta(
                'video:duration', webpage, 'duration', default=None))
            or int_or_none(self._html_search_regex(
                r'videoInfo[^=]*=[^{]*{[^}]*dur:([^,}]*?),', webpage,
                'duration', default=None)))

        # Fallback view counter: first <strong> inside the "w_views"
        # element, guarded because that element may be absent.
        views_html = get_element_by_class('w_views', webpage)
        views_match = re.search(
            r'<strong>([^<]*?)</strong>', views_html) if views_html else None
        view_count = self._first_not_none(
            int_or_none(self._html_search_regex(
                r'Views:[^<]*<strong>([^<]*?)<\/strong>', webpage,
                'view_count', default=None)),
            int_or_none(views_match.group(1)) if views_match else None)

        comment_count = self._first_not_none(
            int_or_none(self._html_search_regex(
                r'Comments:[^<]*<strong>([^<]*?)<\/strong>', webpage,
                'comment_count', default=None)),
            int_or_none(self._html_search_regex(
                r'<span[^>]+id="cmt_num"[^>]*>([^<]+?)<\/span>', webpage,
                'comment_count', default=None)))

        average_rating = float_or_none(
            self._html_search_regex(
                r'{[\s\r]*\$\("#rateYo"\).rateYo\({[^}]*rating:\s*([^,]*?),[^}.]*}',
                webpage, 'average_rating', default=None))

        # The page stores a site-relative thumbnail path.
        thumbnail_path = self._html_search_regex(
            r'videoInfo[\s]*=[\s]*{[^}]*img:[\s]*(?:"|\')([^"]*?)(?:"|\')',
            webpage, 'thumbnail', default=None)
        thumbnail = (
            '%s%s' % (self._SITE_ROOT, thumbnail_path)
            if thumbnail_path else None)

        # og:type is informational only, so its absence must not abort the
        # extraction.
        video_type = self._og_search_property(
            'type', webpage, 'type', default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'url': video_url,
            'uploader_url': uploader_url,
            'upload_date': upload_date,
            'categories': categories,
            'tags': tags,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'type': video_type,
        }
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`import re`

- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`from .common import InfoExtractor`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`from ..utils import (`
			`int_or_none,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`get_element_by_id, str_or_none, get_element_by_class, strip_or_none,`
			`float_or_none)`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00

			`class VidliiIE(InfoExtractor):`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`_VALID_URL = r'(?:https*?:\/\/)*(?:www\.)*vidlii.com\/watch\?v=(?P<id>[^?\s]{11})'`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`_TESTS = [{`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',`
			`'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`'info_dict': {`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'id': 'tJluaH4BJ3v',`
			`'title': 'Vidlii is against me',`
			`'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the contest and no ne of my videos show up so maybe it is broken for everyone else but this one was trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is fixed PS: Jan you are cool please add my video',`
			`'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',`
			`'uploader': 'APPle5auc31995',`
			`'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',`
			`'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',`
			`'upload_date': '20171107',`
			`'categories': 'News & Politics',`
			`'tags': ['Vidlii', 'Jan', 'Videogames'],`
			`'duration': 212,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'view_count': int,`
			`'comment_count': int,`
			`'average_rating': float,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'type': 'video',`
			`'ext': 'mp4'`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`}`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`}, {`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'url': 'https://www.vidlii.com/watch?v=vBo2IcrwOkO',`
			`'md5': 'b42640a596b4dc986702567d49268963',`
			`'info_dict': {`
			`'id': 'vBo2IcrwOkO',`
			`'title': '(OLD VIDEO) i like youtube!!',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'description': 'Original upload date:<br />\nMarch 10th 2011<br />\nCredit goes to people who own content in the video',`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'thumbnail': 'https://www.vidlii.com/usfi/thmp/vBo2IcrwOkO.jpg',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'uploader': 'MyEditedVideoSpartan',`
			`'url': 'https://cdn.vidlii.com/videos/vBo2IcrwOkO.mp4',`
			`'uploader_url': 'https://www.vidlii.com/user/MyEditedVideoSpartan',`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'upload_date': '20171011',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'categories': 'Film & Animation',`
			`'tags': None,`
			`'duration': 34,`
			`'view_count': int,`
			`'comment_count': int,`
			`'average_rating': float,`
			`'type': 'video',`
			`'ext': 'mp4'`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`}`
			`}]`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00
			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
			`webpage = self._download_webpage(url, video_id)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`title_1 = str_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(r'<h1>(.+?)</h1>', webpage,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'title', default=None))`
			`title_2 = str_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(r'<title>([^<]+?)</title>', webpage,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'title', default=None)).replace(`
			`" - VidLii", "")`
			`title_3 = str_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_meta('twitter:title', webpage, 'title',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`default=False)).replace(" - VidLii", "")`
			`# assert title_1 == title_2 == title_3, "TITLE fallback is not working"`
			`title = title_1 or title_2 or title_3`

- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`description = strip_or_none(`
			`get_element_by_id('des_text', webpage).strip())`

- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`uploader_1 = str_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(`
			`r'<div[^>]+class="wt_person"[^>]*>(?:[^<]+)<a href="\/user\/[^>]*?>([^<]*?)<',`
			`webpage,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'uploader', default=None))`
			`uploader_2 = str_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(`
			`r'<img src="[^>]+?class=["\']avt2\s*["\'][^>]+?alt=["\']([^"\']+?)["\']',`
			`webpage, 'uploader', default=None))`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`# assert uploader_1 == uploader_2, "UPLOADER fallback is not working"`
			`uploader = uploader_1 or uploader_2`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
			`url = self._html_search_regex(`
			`r'videoInfo[\s]*=[\s]*{[^}]*src:[\s]*(?:"|\')([^"]*?)(?:"|\')',`
			`webpage, 'url', default=None)`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
			`# get additional properties`
			`uploader_url = "https://www.vidlii.com/user/%s" % uploader`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`# returns date as YYYYMMDD`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`upload_date = str_or_none(`
			`self._html_search_meta('datePublished', webpage, 'upload_date',`
			`default=False).replace("-",`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`""))`

- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`categories = self._html_search_regex(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`r'<div>Category:\s*<\/div>[\s\r]*<div>[\s\r]*<a href="\/videos\?c=[^>]*>([^<]*?)<\/a>',`
			`webpage,`
			`'categories', default=None)`
			`tags = re.findall(r'<a href="/results\?q=[^>]*>[\s]*([^<]*)</a>',`
			`webpage) or None`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`duration_1 = int_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_meta('video:duration', webpage, 'duration',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`default=False))`
			`duration_2 = int_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(`
			`r'videoInfo[^=]*=[^{]*{[^}]*dur:([^,}]*?),', webpage,`
			`'duration', default=None))`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`# assert duration_1 == duration_2, "DURATION fallback is not working"`
			`duration = duration_1 or duration_2`

			`view_count_1 = int_or_none(self._html_search_regex(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`r'Views:[^<]*<strong>([^<]*?)<\/strong>', webpage,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'view_count', default=None))`
			`view_count_2 = re.findall(r'<strong>([^<]*?)</strong>',`
			`get_element_by_class("w_views",`
			`webpage))`
			`view_count_2 = int_or_none(view_count_2[`
			`0]) if view_count_2 else None`
			`# assert view_count_1 == view_count_2, "VIEW COUNT fallback is not working"`
			`view_count = view_count_1 or view_count_2`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`comment_count_1 = int_or_none(self._html_search_regex(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`r'Comments:[^<]*<strong>([^<]*?)<\/strong>', webpage,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'comment_count', default=None))`
			`comment_count_2 = int_or_none(`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`self._html_search_regex(`
			`r'<span[^>]+id="cmt_num"[^>]*>([^<]+?)<\/span>', webpage,`
			`'comment_count', default=None))`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`# assert comment_count_1 == comment_count_2, "COMMENT COUNT fallback is not working"`
			`comment_count = comment_count_1 or comment_count_2`

- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`average_rating = float_or_none(`
			`self._html_search_regex(`
			`r'{[\s\r]*\$\("#rateYo"\).rateYo\({[^}]*rating:\s*([^,]*?),[^}.]*}',`
			`webpage, 'average_rating', default=None))`
			`thumbnail_link = self._html_search_regex(`
			`r'videoInfo[\s]*=[\s]*{[^}]*img:[\s]*(?:"|\')([^"]*?)(?:"|\')',`
			`webpage, 'thumbnail', default=None)`
			`thumbnail = 'https://www.vidlii.com%s' % thumbnail_link`
			`video_type = self._og_search_property('type', webpage, 'type')`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00
			`return {`
			`'id': video_id,`
			`'title': title,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'description': description,`
			`'uploader': uploader,`
			`'url': url,`
			`'uploader_url': uploader_url,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'upload_date': upload_date,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'categories': categories,`
			`'tags': tags,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'comment_count': comment_count,`
			`'average_rating': average_rating,`
			`'thumbnail': thumbnail,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'type': video_type`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`}`