l1ving_youtube-dl/youtube_dl/extractor/lrt.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    parse_duration,
    remove_end,
)


class LRTIE(InfoExtractor):
    """Extractor for recordings published under lrt.lt/mediateka/irasas/.

    A recording page embeds either an m3u8 playlist (video) or, for
    audio-only recordings, a direct link to an MP3 file.  Both kinds share
    the same page-level metadata (title, description, counters, ...).
    """

    IE_NAME = 'lrt.lt'
    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
    _TESTS = [{
        # Video recording served as an m3u8 playlist
        'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
        'info_dict': {
            'id': '54391',
            'ext': 'mp4',
            'title': 'Septynios Kauno dienos',
            'description': 'md5:24d84534c7dc76581e59f5689462411a',
            'duration': 1783,
            'view_count': int,
            'like_count': int,
        },
        'params': {
            'skip_download': True,  # m3u8 download
        },
    }, {
        # Audio-only recording served as a plain MP3 file
        'url': 'http://www.lrt.lt/mediateka/irasas/1013074524',
        'info_dict': {
            'id': '1013074524',
            'ext': 'mp3',
            'title': 'Kita tema 2016-09-05 15:05',
            'duration': 3008,
            'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a single lrt.lt recording page.

        The metadata is scraped from the page once and shared by both
        outcomes.  If the player config references an m3u8 URL, the result
        carries the formats extracted from that playlist; otherwise the
        page is treated as audio-only and the result points directly at
        the MP3 file.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = remove_end(self._og_search_title(webpage), ' - LRT')
        thumbnail = self._og_search_thumbnail(webpage)
        description = self._og_search_description(webpage)
        # The page exposes the length as an "H:M:S" string in a JS variable;
        # a missing variable simply yields no duration (default=None).
        duration = parse_duration(self._search_regex(
            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
            webpage, 'duration', default=None, group='duration'))

        # Counters are optional extras, hence fatal=False.
        view_count = int_or_none(self._html_search_regex(
            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
            webpage, 'view count', fatal=False, group='count'))
        like_count = int_or_none(self._search_regex(
            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
            webpage, 'like count', fatal=False, group='count'))

        # Metadata common to the video and the audio-only result.  Keeping it
        # in one place guarantees both branches report the same fields (the
        # audio branch previously dropped the thumbnail).
        info = {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
        }

        # The video player appends `location.hash.substring(1)` to its `file`
        # URL; that suffix is what tells the m3u8 entry apart from a plain
        # MP3 `file` entry.
        # NOTE(review): `[^//]` is equivalent to `[^/]`; it presumably exists
        # to skip `//file: ...` lines that are commented out - confirm.
        m3u8_url = self._search_regex(
            r'\s+[^//]file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)',
            webpage, 'm3u8 url', group='url', default=None)
        if m3u8_url:
            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
            self._sort_formats(formats)
            info['formats'] = formats
            return info

        # No m3u8 playlist: fall back to the direct MP3 link.  This lookup has
        # neither a default nor fatal=False, so a page without any `file`
        # entry is reported as an extraction failure by _search_regex.
        mp3_url = self._search_regex(
            r'\s+[^//]file\s*:\s*(["\'])(?P<url>.+?)\1',
            webpage, 'mp3 url', group='url')
        info.update({
            'url': mp3_url,
            'ext': 'mp3',
            # Mark the single format as audio-only so that audio-specific
            # format selection treats it correctly.
            'vcodec': 'none',
        })
        return info
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`from .common import InfoExtractor`
			`from ..utils import (`
[lrt] Extract counters 2015-12-27 12:26:48 +06:00			`int_or_none,`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`parse_duration,`
			`remove_end,`
			`)`


			`class LRTIE(InfoExtractor):`
			`IE_NAME = 'lrt.lt'`
			`_VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'`
[lrt] Make it behave better with audio-only links Some lrt.lt links contain only audio (mp3) files so make the extractor better at behaving in these situations. In such cases only grab the link to the MP3 file. 2016-09-05 20:45:35 +03:00			`_TESTS = [{`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`'url': 'http://www.lrt.lt/mediateka/irasas/54391/',`
			`'info_dict': {`
			`'id': '54391',`
			`'ext': 'mp4',`
			`'title': 'Septynios Kauno dienos',`
[lrt] Updated test 2014-10-25 13:24:46 +03:00			`'description': 'md5:24d84534c7dc76581e59f5689462411a',`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`'duration': 1783,`
[lrt] Extract counters 2015-12-27 12:26:48 +06:00			`'view_count': int,`
			`'like_count': int,`
[lrt] Make it behave better with audio-only links Some lrt.lt links contain only audio (mp3) files so make the extractor better at behaving in these situations. In such cases only grab the link to the MP3 file. 2016-09-05 20:45:35 +03:00			`},`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`'params': {`
[lrt] fix the rest of extractor Closes #7690. 2015-11-29 02:58:52 +02:00			`'skip_download': True, # m3u8 download`
[lrt] Make it behave better with audio-only links Some lrt.lt links contain only audio (mp3) files so make the extractor better at behaving in these situations. In such cases only grab the link to the MP3 file. 2016-09-05 20:45:35 +03:00			`},`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`},`
[lrt] Make it behave better with audio-only links Some lrt.lt links contain only audio (mp3) files so make the extractor better at behaving in these situations. In such cases only grab the link to the MP3 file. 2016-09-05 20:45:35 +03:00			`{`
			`'url': 'http://www.lrt.lt/mediateka/irasas/1013074524',`
			`'info_dict': {`
			`'id': '1013074524',`
			`'ext': 'mp3',`
			`'title': 'Kita tema 2016-09-05 15:05',`
			`'duration': 3008,`
			`'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5',`
			`},`
			`}]`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00
			`def _real_extract(self, url):`
[lrt] Modernize 2014-10-27 02:27:49 +01:00			`video_id = self._match_id(url)`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00			`webpage = self._download_webpage(url, video_id)`

			`title = remove_end(self._og_search_title(webpage), ' - LRT')`
			`thumbnail = self._og_search_thumbnail(webpage)`
			`description = self._og_search_description(webpage)`
			`duration = parse_duration(self._search_regex(`
[lrt] Improve 2015-12-27 12:16:55 +06:00			`r'var\s+record_len\s=\s(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',`
			`webpage, 'duration', default=None, group='duration'))`
[lrt] Add new extractor 2014-09-30 02:26:16 +03:00
[lrt] Extract counters 2015-12-27 12:26:48 +06:00			`view_count = int_or_none(self._html_search_regex(`
			`r'<div[^>]+class=(["\']).?record-desc-seen.?\1[^>]*>(?P<count>.+?)</div>',`
			`webpage, 'view count', fatal=False, group='count'))`
			`like_count = int_or_none(self._search_regex(`
			`r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',`
			`webpage, 'like count', fatal=False, group='count'))`

[lrt] Make it behave better with audio-only links Some lrt.lt links contain only audio (mp3) files so make the extractor better at behaving in these situations. In such cases only grab the link to the MP3 file. 2016-09-05 20:45:35 +03:00
			`m3u8_url = self._search_regex(`
			`r'\s+[^//]file\s:\s(["\'])(?P<url>.+?)\1\s\+\slocation\.hash\.substring\(1\)',`
			`webpage, 'm3u8 url', group='url', default=None)`
			`if m3u8_url:`
			`formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')`
			`self._sort_formats(formats)`

			`return {`
			`'id': video_id,`
			`'title': title,`
			`'formats': formats,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`'duration': duration,`
			`'view_count': view_count,`
			`'like_count': like_count`
			`}`
			`else:`
			`mp3_url = self._search_regex(`
			`r'\s+[^//]file\s:\s(["\'])(?P<url>.+?)\1',`
			`webpage, 'mp3 url', group='url')`
			`return {`
			`'id': video_id,`
			`'ext': 'mp3',`
			`'url': mp3_url,`
			`'title': title,`
			`'duration': duration,`
			`'description': description,`
			`'view_count': view_count,`
			`'like_count': like_count`
			`}`