l1ving_youtube-dl/youtube_dl/extractor/vidlii.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    get_element_by_id, get_element_by_class, strip_or_none,
    float_or_none, urljoin, js_to_json, unified_strdate)


class VidliiIE(InfoExtractor):
    """Extractor for single video watch pages on vidlii.com.

    The media URL, the thumbnail and a fallback duration are read from the
    inline ``videoInfo`` JavaScript object of the watch page; all remaining
    metadata is scraped from the page markup and is optional.
    """

    # Scheme and "www." are optional; the video id is the 11 characters
    # following "v=".
    _VALID_URL = r'(?:https?://)?(?:www\.)?vidlii\.com/watch\?v=(?P<id>[^?\s]{11})'
    _TESTS = [{
        'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',
        'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',
        'info_dict': {
            'id': 'tJluaH4BJ3v',
            'title': 'Vidlii is against me',
            'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the '
                           'contest and no ne of my videos show up so maybe it is broken for everyone else but this one was '
                           'trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is '
                           'fixed PS: Jan you are cool please add my video',
            'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',
            'uploader': 'APPle5auc31995',
            'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',
            'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',
            'upload_date': '20171107',
            'categories': ['News & Politics'],
            'tags': ['Vidlii', 'Jan', 'Videogames'],
            'duration': 212,
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'type': 'video',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.vidlii.com/watch?v=vBo2IcrwOkO',
        'md5': 'b42640a596b4dc986702567d49268963',
        'info_dict': {
            'id': 'vBo2IcrwOkO',
            'title': '(OLD VIDEO) i like youtube!!',
            'description': 'Original upload date:<br />\nMarch 10th 2011<br />\nCredit goes to people who own content in the video',
            'thumbnail': 'https://www.vidlii.com/usfi/thmp/vBo2IcrwOkO.jpg',
            'uploader': 'MyEditedVideoSpartan',
            'url': 'https://cdn.vidlii.com/videos/vBo2IcrwOkO.mp4',
            'uploader_url': 'https://www.vidlii.com/user/MyEditedVideoSpartan',
            'upload_date': '20171011',
            'categories': ['Film & Animation'],
            'tags': None,
            'duration': 34,
            'view_count': int,
            'comment_count': int,
            'average_rating': float,
            'type': 'video',
            'ext': 'mp4',
        },
    }]

    def _real_extract(self, url):
        """Download the watch page for ``url`` and build the info dict.

        Extraction aborts with an extractor error when the ``videoInfo``
        object or a title cannot be found; every other field falls back to
        ``None`` when it is missing from the page.
        """
        video_id = self._match_id(url)
        # Always fetch the canonical URL: _VALID_URL also accepts input
        # without a scheme, which cannot be downloaded as given.
        webpage = self._download_webpage(
            'https://www.vidlii.com/watch?v=%s' % video_id, video_id)

        # The player configuration is a JS object literal, not HTML, so it is
        # matched without HTML cleaning and converted with js_to_json. It
        # carries the media URL, hence a miss is fatal.
        video_info = self._parse_json(
            self._search_regex(
                r'var\s+videoInfo\s*=\s*({[^}]*})', webpage, 'videoInfo'),
            video_id, transform_source=js_to_json)

        # Title: <title> tag, then twitter:title meta (both may carry the
        # site suffix), finally the first <h1> on the page.
        title = (
            self._html_search_regex(
                r'<title>([^<]+?)</title>', webpage, 'title', default=None) or
            self._html_search_meta(
                'twitter:title', webpage, 'title', default=None) or
            '').replace(' - VidLii', '')
        if not title:
            title = self._html_search_regex(
                r'<h1>(.+?)</h1>', webpage, 'title')

        description = strip_or_none(get_element_by_id('des_text', webpage))

        # Either the link text inside the "wt_person" box or the alt text of
        # the "avt2" avatar image; the first non-empty group is returned.
        uploader = self._html_search_regex(
            r'<div[^>]+class="wt_person"[^>]*>(?:[^<]+)<a href="/user/[^>]*?>([^<]*?)<|<img src="[^>]+?class=('
            r'?:"avt2\s*"|\'avt2\s*\')[^>]+?alt=(?:"([^"]+?)"|\'([^\']+?)\')>',
            webpage, 'uploader', default=None)
        uploader_url = urljoin('https://www.vidlii.com/user/', uploader)

        # unified_strdate normalises the date to YYYYMMDD.
        upload_date = unified_strdate(
            self._html_search_meta(
                'datePublished', webpage, 'upload_date', default=None) or
            self._html_search_regex(
                r'<date>([^<]*?)</date>', webpage, 'upload_date',
                default=None))

        category = self._html_search_regex(
            r'<div>Category:\s*</div>\s*<div>\s*<a href="/videos\?c=[^>]*>([^<]*?)</a>',
            webpage, 'categories', default=None)
        tags = re.findall(
            r'<a href="/results\?q=[^>]*>\s*([^<]*)</a>', webpage) or None

        duration = int_or_none(
            self._html_search_meta(
                'video:duration', webpage, 'duration', default=None) or
            video_info.get('dur'))

        view_count = int_or_none(self._html_search_regex(
            r'Views:[^<]*<strong>([^<]*?)</strong>', webpage, 'view_count',
            default=None))
        if view_count is None:
            # Fallback: first <strong> inside the "w_views" element, which
            # may be absent from the page altogether.
            views_html = get_element_by_class('w_views', webpage) or ''
            view_count = int_or_none(self._search_regex(
                r'<strong>([^<]*?)</strong>', views_html, 'view_count',
                default=None))

        comment_count = int_or_none(self._html_search_regex(
            r'Comments:[^<]*<strong>([^<]*?)</strong>|<span[^>]+id="cmt_num"[^>]*>([^<]+?)</span>',
            webpage, 'comment_count', default=None))

        # The rating is passed as an option to the rateYo widget initialiser.
        average_rating = float_or_none(self._html_search_regex(
            r'{\s*\$\("#rateYo"\)\.rateYo\({[^}]*rating:\s*([^,]*?),[^}.]*}',
            webpage, 'average_rating', default=None))

        thumbnail = urljoin('https://www.vidlii.com/', video_info.get('img'))
        # og:type is purely informational, so its absence must not abort.
        video_type = self._og_search_property(
            'type', webpage, 'type', default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'url': video_info.get('src'),
            'uploader_url': uploader_url,
            'upload_date': upload_date,
            'categories': [category] if category else None,
            'tags': tags,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'type': video_type,
        }
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`import re`

- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`from .common import InfoExtractor`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`from ..utils import (`
			`int_or_none,`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`get_element_by_id, get_element_by_class, strip_or_none,`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`float_or_none, urljoin, js_to_json, unified_strdate)`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00

			`class VidliiIE(InfoExtractor):`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`_VALID_URL = r'(?:https*?:\/\/)*(?:www\.)*vidlii.com\/watch\?v=(?P<id>[^?\s]{11})'`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`_TESTS = [{`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',`
			`'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`'info_dict': {`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'id': 'tJluaH4BJ3v',`
			`'title': 'Vidlii is against me',`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`'description': 'I have HAD it. Vidlii does not like me. I have tried to uplaod videos and submit them to the '`
			`'contest and no ne of my videos show up so maybe it is broken for everyone else but this one was '`
			`'trying to submit it because I wanted to submit to the contest :) Tanks I hope the website is '`
			`'fixed PS: Jan you are cool please add my video',`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'thumbnail': 'https://www.vidlii.com/usfi/thmp/tJluaH4BJ3v.jpg',`
			`'uploader': 'APPle5auc31995',`
			`'url': 'https://cdn.vidlii.com/videos/tJluaH4BJ3v.mp4',`
			`'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',`
			`'upload_date': '20171107',`
			`'categories': 'News & Politics',`
			`'tags': ['Vidlii', 'Jan', 'Videogames'],`
			`'duration': 212,`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'view_count': int,`
			`'comment_count': int,`
			`'average_rating': float,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'type': 'video',`
			`'ext': 'mp4'`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00			`}`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`}, {`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'url': 'https://www.vidlii.com/watch?v=vBo2IcrwOkO',`
			`'md5': 'b42640a596b4dc986702567d49268963',`
			`'info_dict': {`
			`'id': 'vBo2IcrwOkO',`
			`'title': '(OLD VIDEO) i like youtube!!',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'description': 'Original upload date:<br />\nMarch 10th 2011<br />\nCredit goes to people who own content in the video',`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'thumbnail': 'https://www.vidlii.com/usfi/thmp/vBo2IcrwOkO.jpg',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'uploader': 'MyEditedVideoSpartan',`
			`'url': 'https://cdn.vidlii.com/videos/vBo2IcrwOkO.mp4',`
			`'uploader_url': 'https://www.vidlii.com/user/MyEditedVideoSpartan',`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`'upload_date': '20171011',`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`'categories': 'Film & Animation',`
			`'tags': None,`
			`'duration': 34,`
			`'view_count': int,`
			`'comment_count': int,`
			`'average_rating': float,`
			`'type': 'video',`
			`'ext': 'mp4'`
Test 2 und 3 hinzugefuegt 2017-11-17 10:00:17 +01:00			`}`
			`}]`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00
			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
			`webpage = self._download_webpage(url, video_id)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`# extract videoInfo variable for further use`
			`videoInfo_dict = self._parse_json(js_to_json(self._html_search_regex(r'var videoInfo\s*=\s*({[^}]*})', webpage,`
			`'videoInfo', fatal=False)), video_id)`

[vidlii] Add new extractor, see #14472 2017-11-17 11:10:58 +01:00			`# extract basic properties of video`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`title = (self._html_search_regex(r'<title>([^<]+?)</title>', webpage, 'title', default='') or`
			`self._html_search_meta('twitter:title', webpage, 'title', default='')).replace(" - VidLii",`
			`"") or self._html_search_regex(`
			`r'<h1>(.+?)</h1>', webpage, 'title', default=None)`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00
			`description = strip_or_none(get_element_by_id('des_text', webpage))`

[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`uploader = self._html_search_regex(`
			`r'<div[^>]+class="wt_person"[^>]*>(?:[^<]+)<a href="/user/[^>]*?>([^<]*?)<|<img src="[^>]+?class=('`
			`r'?:"avt2\s*"|\'avt2\s*\')[^>]+?alt=(?:"([^"]+?)"|\'([^\']+?)\')>', webpage, 'uploader', default=None,`
			`fatal=False)`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`video_url = videoInfo_dict.get("src")`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
			`# get additional properties`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`uploader_url = urljoin('https://www.vidlii.com/user/', uploader)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00			`# returns date as YYYYMMDD`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`upload_date = unified_strdate(`
			`self._html_search_meta('datePublished', webpage, 'upload_date', default=None,`
			`fatal=False) or self._html_search_regex(r'<date>(['r'^<]*?)</date>', webpage,`
			`'upload_date',`
			`default=None, fatal=False))`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`categories = self._html_search_regex(`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`r'<div>Category:\s*<\/div>[\s\r]*<div>[\s\r]*<a href="\/videos\?c=[^>]*>([^<]*?)<\/a>', webpage, 'categories',`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`default=None, fatal=False)`
			`tags = re.findall(r'<a href="/results\?q=[^>]*>\s*([^<]*)</a>', webpage) or None`
			`duration = int_or_none(`
			`self._html_search_meta('video:duration', webpage, 'duration', default=False, fatal=False) or`
			`videoInfo_dict.get("dur"))`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00
			`view_count_fb = re.findall(r'<strong>([^<]*?)</strong>', get_element_by_class("w_views", webpage))`
[vidlii] Add new extractor. See issue #14472 2017-11-17 11:25:31 +01:00			`view_count_fb = view_count_fb[0] if view_count_fb else None`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`view_count = int_or_none(self._html_search_regex(r'Views:[^<]*<strong>([^<]*?)<\/strong>', webpage, 'view_count',`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`default=None, fatal=False)) or int_or_none(view_count_fb)`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`comment_count = int_or_none(`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`self._html_search_regex(r'Comments:[^<]*<strong>([^<]*?)<\/strong>|<span[^>]+id="cmt_num"[^>]*>(['`
			`r'^<]+?)<\/span>', webpage, 'comment_count',`
			`default=None, fatal=False))`
- Added tests for extractor vidlii - Tested fallbacks by use of assertions 2017-11-17 11:07:24 +01:00
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`average_rating = float_or_none(`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) 2017-11-22 13:44:08 +01:00			`self._html_search_regex(r'{[\s\r]*\$\("#rateYo"\).rateYo\({[^}]*rating:\s*([^,]*?),[^}.]*}',`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`webpage, 'average_rating', default=None, fatal=False))`
			`thumbnail_link = videoInfo_dict.get("img")`
			`thumbnail = urljoin('https://www.vidlii.com/', thumbnail_link)`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`video_type = self._og_search_property('type', webpage, 'type')`
- Added template for extractor vidlii.py, last commit only added vidlii extractor into extractors.py 2017-11-08 14:09:57 +01:00
			`return {`
			`'id': video_id,`
			`'title': title,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'description': description,`
			`'uploader': uploader,`
[vidlii] Fixed suggestions for new extractor (see pull request for issue #14779) - improved regular expressions - implemented use of helper functions js_to_json, unified_strdate 2017-11-22 20:35:02 +01:00			`'url': video_url,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'uploader_url': uploader_url,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'upload_date': upload_date,`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`'categories': categories,`
			`'tags': tags,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'comment_count': comment_count,`
			`'average_rating': average_rating,`
			`'thumbnail': thumbnail,`
- Improved extraction for Vidlii (added fallbacks, simplified RegExes) - Added first test 2017-11-15 22:55:56 +01:00			`'type': video_type`
- Implemented basic extraction for vidlii but requires improvement 2017-11-09 09:38:47 +01:00			`}`