l1ving_youtube-dl/youtube_dl/extractor/tudou.py

# coding: utf-8

from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    int_or_none,
    float_or_none,
    unescapeHTML,
)


class TudouIE(InfoExtractor):
    # Extractor for single Tudou video pages (programs/wlplay "view" URLs and
    # videos addressed inside a listplay/albumplay URL). Items that carry a
    # Youku 'vcode' are delegated to the Youku extractor.
    IE_NAME = 'tudou'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
    # NOTE(review): the expected 'id', 'duration' and 'filesize' values below
    # do not obviously correspond to the top-level dict built by
    # _real_extract (which returns the 11-character URL code as 'id' and
    # keeps duration/filesize per part) - confirm these tests are current.
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
        'md5': '140a49ed444bd22f93330985d8475fcb',
        'info_dict': {
            'id': '159448201',
            'ext': 'f4v',
            'title': '卡马乔国足开大脚长传冲吊集锦',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1372113489,
            'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',
            'duration': 289.04,
            'view_count': int,
            'filesize': int,
        }
    }, {
        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
        'info_dict': {
            'id': '117049447',
            'ext': 'f4v',
            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1349207518,
            'description': 'md5:294612423894260f2dcd5c6c04fe248b',
            'duration': 5478.33,
            'view_count': int,
            'filesize': int,
        }
    }]

    # Sent as the Referer header with every media part request.
    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'

    def _url_for_id(self, video_id, quality=None):
        """Resolve the media URL of a single segment.

        Requests the v2.tudou.com ``f`` endpoint for ``video_id`` (appending
        the ``hd`` quality argument when ``quality`` is truthy) and returns
        the text of the root element of the XML response.
        """
        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
        if quality:
            info_url += '&hd' + quality
        xml_data = self._download_xml(
            info_url, video_id, 'Opening the info XML page')
        return xml_data.text

    def _real_extract(self, url):
        """Build the info dict for the video addressed by ``url``.

        Returns a ``url_result`` pointing at the Youku extractor when the
        item data contains a ``vcode``; otherwise returns an info dict with
        one format per numeric quality key, each made of its segment parts.
        Raises ``KeyError`` if the item data lacks ``itemSegs`` or ``kw``.
        """
        video_id = self._match_id(url)
        item_data = self._download_json(
            'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)

        youku_vcode = item_data.get('vcode')
        if youku_vcode:
            return self.url_result('youku:' + youku_vcode, ie='Youku')

        segments = self._parse_json(item_data['itemSegs'], video_id)
        # The keys look like the values that have to be passed as the hd
        # argument of the request URL; non-numeric qualities are ignored
        # (see issue #3643). Sort numerically, lowest quality first.
        qualities = sorted((k for k in segments if k.isdigit()), key=int)

        formats = []
        for quality in qualities:
            parts = []
            for part in segments[quality]:
                final_url = self._url_for_id(part['k'], quality)
                parts.append({
                    'url': final_url,
                    # Extension is whatever follows the last dot of the URL
                    # once the query string has been stripped.
                    'ext': final_url.split('?')[0].split('.')[-1],
                    # 'seconds' is divided by 1000 to obtain the duration.
                    'duration': float_or_none(part.get('seconds'), 1000),
                    'filesize': int_or_none(part.get('size')),
                    'http_headers': {
                        'Referer': self._PLAYER_URL,
                    },
                })
            formats.append({
                # 'format_id' is the standard info-dict key (was misspelled
                # 'formats_id', so the identifier was silently dropped).
                'format_id': compat_str(quality),
                'parts': parts,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': unescapeHTML(item_data['kw']),
            'thumbnail': item_data.get('pic'),
            'description': item_data.get('desc'),
            'view_count': int_or_none(item_data.get('playTimes')),
            # 'pt' is reported in milliseconds (13-digit values); the
            # info-dict timestamp is a UNIX timestamp in seconds.
            'timestamp': int_or_none(item_data.get('pt'), scale=1000),
            'formats': formats,
        }


class TudouPlaylistIE(InfoExtractor):
    # Extractor for Tudou "listplay" URLs that address a whole playlist
    # (a single 11-character code followed by ".html").
    IE_NAME = 'tudou:playlist'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
    _TESTS = [{
        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
        'info_dict': {
            'id': 'zzdE77v6Mmo',
        },
        'playlist_mincount': 209,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one Tudou entry per listed item.

        The item list is fetched from the ``plist.action`` JSON endpoint;
        each item's ``icode`` becomes a ``programs/view`` URL handled by the
        'Tudou' extractor, with ``kw`` passed as the entry title.
        """
        list_code = self._match_id(url)
        listing = self._download_json(
            'http://www.tudou.com/tvp/plist.action?lcode=%s' % list_code,
            list_code)

        results = []
        for video in listing['items']:
            icode = video['icode']
            results.append(self.url_result(
                'http://www.tudou.com/programs/view/%s' % icode,
                'Tudou', icode, video['kw']))
        return self.playlist_result(results, list_code)


class TudouAlbumIE(InfoExtractor):
    # Extractor for Tudou album URLs ("albumcover" or "albumplay" followed
    # by an 11-character album code).
    IE_NAME = 'tudou:album'
    _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
        'info_dict': {
            'id': 'v5qckFJvNJg',
        },
        'playlist_mincount': 45,
    }]

    def _real_extract(self, url):
        """Return a playlist result with one Tudou entry per album item.

        The item list is fetched from the ``alist.action`` JSON endpoint;
        each item's ``icode`` becomes a ``programs/view`` URL handled by the
        'Tudou' extractor, with ``kw`` passed as the entry title.
        """
        album_code = self._match_id(url)
        listing = self._download_json(
            'http://www.tudou.com/tvp/alist.action?acode=%s' % album_code,
            album_code)

        results = []
        for video in listing['items']:
            icode = video['icode']
            results.append(self.url_result(
                'http://www.tudou.com/programs/view/%s' % icode,
                'Tudou', icode, video['kw']))
        return self.playlist_result(results, album_code)
TudouIE: extract all the segments of the video and download the best quality (closes #975) Also simplify a bit the extraction of the id from the url and write directly the title for the test video 2013-07-02 12:38:24 +02:00			`# coding: utf-8`

[tudou] Modernize 2014-09-01 00:16:26 +02:00			`from __future__ import unicode_literals`

Added an IE for todou 2013-06-25 22:48:08 +05:00			`from .common import InfoExtractor`
[tudou] Use single quotes and compat_str 2015-09-13 02:57:14 +08:00			`from ..compat import compat_str`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`from ..utils import (`
			`int_or_none,`
			`float_or_none,`
			`unescapeHTML,`
			`)`
Added an IE for todou 2013-06-25 22:48:08 +05:00

			`class TudouIE(InfoExtractor):`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`IE_NAME = 'tudou'`
			`_VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs\|wlplay)/view\|(?:listplay\|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'`
[tudou] Add support for youku links (Closes #1571) 2013-10-15 01:20:04 +02:00			`_TESTS = [{`
[tudou] Modernize 2014-09-01 00:16:26 +02:00			`'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',`
			`'md5': '140a49ed444bd22f93330985d8475fcb',`
			`'info_dict': {`
			`'id': '159448201',`
			`'ext': 'f4v',`
			`'title': '卡马乔国足开大脚长传冲吊集锦',`
			`'thumbnail': 're:^https?://.*\.jpg$',`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`'timestamp': 1372113489000,`
			`'description': '卡马乔卡家军，开大脚先进战术不完全集锦！',`
			`'duration': 289.04,`
			`'view_count': int,`
			`'filesize': int,`
Move tests to the IE definitions 2013-06-27 20:46:46 +02:00			`}`
[tudou] Add test case for #3643 2014-09-01 00:20:12 +02:00			`}, {`
			`'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',`
			`'info_dict': {`
			`'id': '117049447',`
			`'ext': 'f4v',`
			`'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',`
			`'thumbnail': 're:^https?://.*\.jpg$',`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`'timestamp': 1349207518000,`
			`'description': 'md5:294612423894260f2dcd5c6c04fe248b',`
			`'duration': 5478.33,`
			`'view_count': int,`
			`'filesize': int,`
[tudou] Add test case for #3643 2014-09-01 00:20:12 +02:00			`}`
[tudou] Add support for youku links (Closes #1571) 2013-10-15 01:20:04 +02:00			`}]`
Added an IE for todou 2013-06-25 22:48:08 +05:00
[tudou] Fix extracion 2015-08-05 18:22:25 +08:00			`_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'`
[tudou] Extract player URL from the webpage 2015-08-05 23:07:52 +06:00
[tudou] Avoid shadowing builtin names 2015-09-12 22:52:51 +08:00			`def _url_for_id(self, video_id, quality=None):`
[tudou] Use single quotes and compat_str 2015-09-13 02:57:14 +08:00			`info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)`
TudouIE: extract all the segments of the video and download the best quality (closes #975) Also simplify a bit the extraction of the id from the url and write directly the title for the test video 2013-07-02 12:38:24 +02:00			`if quality:`
			`info_url += '&hd' + quality`
[tudou] Use _download_xml 2015-09-13 02:36:51 +08:00			`xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")`
			`final_url = xml_data.text`
TudouIE: extract all the segments of the video and download the best quality (closes #975) Also simplify a bit the extraction of the id from the url and write directly the title for the test video 2013-07-02 12:38:24 +02:00			`return final_url`

Added an IE for todou 2013-06-25 22:48:08 +05:00			`def _real_extract(self, url):`
[tudou] Fix extraction 2015-01-08 17:50:46 +01:00			`video_id = self._match_id(url)`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`item_data = self._download_json(`
			`'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)`
[tudou] Add support for youku links (Closes #1571) 2013-10-15 01:20:04 +02:00
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`youku_vcode = item_data.get('vcode')`
[tudou] Modernize 2015-09-12 22:51:49 +08:00			`if youku_vcode:`
			`return self.url_result('youku:' + youku_vcode, ie='Youku')`
[tudou] Add support for youku links (Closes #1571) 2013-10-15 01:20:04 +02:00
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00			`segments = self._parse_json(item_data['itemSegs'], video_id)`
TudouIE: extract all the segments of the video and download the best quality (closes #975) Also simplify a bit the extraction of the id from the url and write directly the title for the test video 2013-07-02 12:38:24 +02:00			`# It looks like the keys are the arguments that have to be passed as`
[tudou] fix multipart formats extraction 2016-01-15 01:26:20 +01:00			`# the hd field in the request url, we filter non-number qualities (see issue #3643).`
			`qualites = sorted(filter(lambda k: k.isdigit(), segments.keys()),`
			`key=lambda k: int(k))`
			`formats = []`
			`for quality in qualites:`
			`parts = []`
			`for part in segments[quality]:`
			`final_url = self._url_for_id(part['k'], quality)`
			`ext = (final_url.split('?')[0]).split('.')[-1]`
			`part_info = {`
			`'url': final_url,`
			`'ext': ext,`
			`'duration': float_or_none(part.get('seconds'), 1000),`
			`'filesize': int_or_none(part.get('size')),`
			`'http_headers': {`
			`'Referer': self._PLAYER_URL,`
			`},`
			`}`
			`parts.append(part_info)`
			`formats.append({`
			`'formats_id': compat_str(quality),`
			`'parts': parts,`
			`})`
			`self._sort_formats(formats)`
TudouIE: extract all the segments of the video and download the best quality (closes #975) Also simplify a bit the extraction of the id from the url and write directly the title for the test video 2013-07-02 12:38:24 +02:00
[tudou] Fix extraction 2015-01-08 17:50:46 +01:00			`return {`
			`'id': video_id,`
[tudou] fix multipart formats extraction 2016-01-15 01:26:20 +01:00			`'title': unescapeHTML(item_data['kw']),`
			`'thumbnail': item_data.get('pic'),`
			`'description': item_data.get('desc'),`
			`'view_count': int_or_none(item_data.get('playTimes')),`
			`'timestamp': int_or_none(item_data.get('pt')),`
			`'formats': formats,`
[tudou] Fix extraction 2015-01-08 17:50:46 +01:00			`}`
[tudou] Add support for Albums and Playlists and extract more metadata 2016-01-13 13:29:00 +01:00

			`class TudouPlaylistIE(InfoExtractor):`
			`IE_NAME = 'tudou:playlist'`
			`_VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'`
			`_TESTS = [{`
			`'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',`
			`'info_dict': {`
			`'id': 'zzdE77v6Mmo',`
			`},`
			`'playlist_mincount': 209,`
			`}]`

			`def _real_extract(self, url):`
			`playlist_id = self._match_id(url)`
			`playlist_data = self._download_json(`
			`'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)`
			`entries = [self.url_result(`
			`'http://www.tudou.com/programs/view/%s' % item['icode'],`
			`'Tudou', item['icode'],`
			`item['kw']) for item in playlist_data['items']]`
			`return self.playlist_result(entries, playlist_id)`


			`class TudouAlbumIE(InfoExtractor):`
			`IE_NAME = 'tudou:album'`
			`_VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover\|play)/(?P<id>[\w-]{11})'`
			`_TESTS = [{`
			`'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',`
			`'info_dict': {`
			`'id': 'v5qckFJvNJg',`
			`},`
			`'playlist_mincount': 45,`
			`}]`

			`def _real_extract(self, url):`
			`album_id = self._match_id(url)`
			`album_data = self._download_json(`
			`'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)`
			`entries = [self.url_result(`
			`'http://www.tudou.com/programs/view/%s' % item['icode'],`
			`'Tudou', item['icode'],`
			`item['kw']) for item in album_data['items']]`
			`return self.playlist_result(entries, album_id)`