Merge remote-tracking branch 'upstream/master'

2016-05-09 07:41:00 +03:00 · 2016-05-09 07:41:00 +03:00 · 0a8215677a
commit 0a8215677a
parent a0abd85b85 eb785b856f
10 changed files with 459 additions and 25 deletions
--- a/2
+++ b/2
@ -169,3 +169,5 @@ Viťas Strádal
 Kagami Hiiragi
 Philip Huppert
 blahgeek
 Kevin Deldycke
 inondle
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -2025,6 +2025,7 @@ class YoutubeDL(object):
        if opts_cookiefile is None:
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            opts_cookiefile = compat_expanduser(opts_cookiefile)
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            if os.access(opts_cookiefile, os.R_OK):
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -86,7 +86,9 @@ def _real_main(argv=None):
            if opts.batchfile == '-':
                batchfd = sys.stdin
            else:
-                batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+                batchfd = io.open(
                    compat_expanduser(opts.batchfile),
                    'r', encoding='utf-8', errors='ignore')
            batch_urls = read_batch_urls(batchfd)
            if opts.verbose:
                write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
@ -404,7 +406,7 @@ def _real_main(argv=None):
        try:
            if opts.load_info_filename is not None:
-                retcode = ydl.download_with_info_file(opts.load_info_filename)
+                retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))
            else:
                retcode = ydl.download(all_urls)
        except MaxDownloadsReached:
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@ -224,7 +224,7 @@ class FFmpegFD(ExternalFD):
                args += ['-rtmp_live', 'live']
        args += ['-i', url, '-c', 'copy']
-        if protocol == 'm3u8':
+        if protocol in ('m3u8', 'm3u8_native'):
            if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
                args += ['-f', 'mpegts']
            else:
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@ -4,6 +4,7 @@ import os.path
 import re
 from .fragment import FragmentFD
 from .external import FFmpegFD
 from ..compat import compat_urlparse
 from ..utils import (
@ -17,12 +18,34 @@ class HlsFD(FragmentFD):
    FD_NAME = 'hlsnative'
    @staticmethod
    def can_download(manifest):
        UNSUPPORTED_FEATURES = (
            r'#EXT-X-KEY:METHOD=(?!NONE)',  # encrypted streams [1]
            r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2]
            r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3]
            # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
            # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
            # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
        )
        return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
    def real_download(self, filename, info_dict):
        man_url = info_dict['url']
        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
        manifest = self.ydl.urlopen(man_url).read()
        s = manifest.decode('utf-8', 'ignore')
        if not self.can_download(s):
            self.report_warning(
                'hlsnative has detected features it does not support, '
                'extraction will be delegated to ffmpeg')
            fd = FFmpegFD(self.ydl, self.params)
            for ph in self._progress_hooks:
                fd.add_progress_hook(ph)
            return fd.real_download(filename, info_dict)
        fragment_urls = []
        for line in s.splitlines():
            line = line.strip()
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor):
            'es': 'E[ESP]',
        }
        langcode = LANGS.get(lang, lang)
        formats = []
        for format_id, format_dict in player_info['VSR'].items():
            f = dict(format_dict)
            versionCode = f.get('versionCode')
-            langcode = LANGS.get(lang, lang)
+            l = re.escape(langcode)
-            lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
+
-            lang_pref = None
+            # Language preference from most to least priority
-            if versionCode:
+            # Reference: section 5.6.3 of
-                matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)]
+            # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
-                lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs)
+            PREFERENCES = (
-            source_pref = 0
+                # original version in requested language, without subtitles
-            if versionCode is not None:
+                r'VO{0}$'.format(l),
-                # The original version with subtitles has lower relevance
+                # original version in requested language, with partial subtitles in requested language
-                if re.match(r'VO-ST(F|A|E)', versionCode):
+                r'VO{0}-ST{0}$'.format(l),
-                    source_pref -= 10
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
-                # The version with sourds/mal subtitles has also lower relevance
+                r'VO{0}-STM{0}$'.format(l),
-                elif re.match(r'VO?(F|A|E)-STM\1', versionCode):
+                # non-original (dubbed) version in requested language, without subtitles
-                    source_pref -= 9
+                r'V{0}$'.format(l),
                # non-original (dubbed) version in requested language, with partial subtitles in requested language
                r'V{0}-ST{0}$'.format(l),
                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
                r'V{0}-STM{0}$'.format(l),
                # original version in requested language, with partial subtitles in different language
                r'VO{0}-ST(?!{0}).+?$'.format(l),
                # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
                r'VO{0}-STM(?!{0}).+?$'.format(l),
                # original version in different language, with partial subtitles in requested language
                r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
                # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
                r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
                # original version in different language, without subtitles
                r'VO(?:(?!{0}))?$'.format(l),
                # original version in different language, with partial subtitles in different language
                r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
                # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
                r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
            )
            for pref, p in enumerate(PREFERENCES):
                if re.match(p, versionCode):
                    lang_pref = len(PREFERENCES) - pref
                    break
            else:
                lang_pref = -1
            format = {
                'format_id': format_id,
                'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor):
                'height': int_or_none(f.get('height')),
                'tbr': int_or_none(f.get('bitrate')),
                'quality': qfunc(f.get('quality')),
                'source_preference': source_pref,
            }
            if f.get('mediaType') == 'rtmp':
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -384,6 +384,7 @@ from .limelight import (
    LimelightChannelIE,
    LimelightChannelListIE,
 )
 from .litv import LiTVIE
 from .liveleak import LiveLeakIE
 from .livestream import (
    LivestreamIE,
@ -408,6 +409,10 @@ from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .mgtv import MGTVIE
 from .microsoftvirtualacademy import (
    MicrosoftVirtualAcademyIE,
    MicrosoftVirtualAcademyCourseIE,
 )
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .minoto import MinotoIE
--- a/youtube_dl/extractor/litv.py
+++ b/youtube_dl/extractor/litv.py
@ -0,0 +1,137 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import json
 import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    int_or_none,
    smuggle_url,
    unsmuggle_url,
 )
 class LiTVIE(InfoExtractor):
    _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
    _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
    _TESTS = [{
        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
        'info_dict': {
            'id': 'VOD00041606',
            'title': '花千骨',
        },
        'playlist_count': 50,
    }, {
        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
        'info_dict': {
            'id': 'VOD00041610',
            'ext': 'mp4',
            'title': '花千骨第1集',
            'thumbnail': 're:https?://.*\.jpg$',
            'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
            'episode_number': 1,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,  # m3u8 download
        },
        'skip': 'Georestricted to Taiwan',
    }]
    def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True):
        """Return a playlist result holding one URL entry per episode.

        season_list supplies the playlist id ('contentId') and the episodes
        ('episode'); view_data supplies the playlist title and the content
        type used to build each episode URL. video_id only appears in the
        on-screen hint, which is printed when prompt is true. vod_data is
        accepted but not used here.
        """
        episode_title = view_data['title']
        content_id = season_list['contentId']

        if prompt:
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))

        all_episodes = []
        for episode in season_list['episode']:
            episode_url = self._URL_TEMPLATE % (
                view_data['contentType'], episode['contentId'])
            # The smuggled flag makes the per-episode extraction return the
            # single video rather than this playlist again (infinite recursion)
            all_episodes.append(self.url_result(
                smuggle_url(episode_url, {'force_noplaylist': True})))

        return self.playlist_result(all_episodes, content_id, episode_title)
    def _real_extract(self, url):
        """Extract one LiTV video, or the season it belongs to as a playlist.

        A playlist is returned when the page lists seasons and neither the
        'noplaylist' downloader parameter nor a smuggled 'force_noplaylist'
        flag asks for a single video. Otherwise an info dict with HLS formats
        is returned.

        Raises ExtractorError when the service reports an error message or
        returns no 'fullpath'; the out-of-region error is reported through
        raise_geo_restricted.
        """
        url, data = unsmuggle_url(url, {})

        video_id = self._match_id(url)

        noplaylist = self._downloader.params.get('noplaylist')
        noplaylist_prompt = True
        if 'force_noplaylist' in data:
            # Set by _extract_playlist for its entries; no hint is printed then
            noplaylist = data['force_noplaylist']
            noplaylist_prompt = False

        webpage = self._download_webpage(url, video_id)

        # Collect every `viewData.<name> = "<value>"` assignment of the page
        # into a name -> value mapping (the middle group is the quote char)
        view_data = dict(
            (name, value) for name, _, value in re.findall(
                r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
                webpage))

        vod_data = self._parse_json(self._search_regex(
            r'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
            video_id)

        season_list = list(vod_data.get('seasonList', {}).values())
        if season_list:
            if not noplaylist:
                return self._extract_playlist(
                    season_list[0], video_id, vod_data, view_data,
                    prompt=noplaylist_prompt)

            if noplaylist_prompt:
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

        # In browsers `getMainUrl` request is always issued. Usually this
        # endpoint gives the same result as the data embedded in the webpage.
        # If georestricted, there are no embedded data, so an extra request is
        # necessary to get the error code
        video_data = self._parse_json(self._search_regex(
            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
            webpage, 'video data', default='{}'), video_id)
        if not video_data:
            payload = {
                'assetId': view_data['assetId'],
                'watchDevices': vod_data['watchDevices'],
                'contentType': view_data['contentType'],
            }
            video_data = self._download_json(
                'https://www.litv.tv/vod/getMainUrl', video_id,
                data=json.dumps(payload).encode('utf-8'),
                headers={'Content-Type': 'application/json'})

        if not video_data.get('fullpath'):
            error_msg = video_data.get('errorMessage')
            if error_msg == 'vod.error.outsideregionerror':
                self.raise_geo_restricted('This video is available in Taiwan only')
            if error_msg:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
            raise ExtractorError('Unexpected result from %s' % self.IE_NAME)

        formats = self._extract_m3u8_formats(
            video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
        for a_format in formats:
            # LiTV HLS segments don't like compression
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True

        title = view_data['title'] + view_data.get('secondaryMark', '')
        description = view_data.get('description')
        thumbnail = view_data.get('imageFile')
        categories = [item['name'] for item in vod_data.get('category', [])]
        episode = int_or_none(view_data.get('episode'))

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'categories': categories,
            'episode_number': episode,
        }
--- a/youtube_dl/extractor/microsoftvirtualacademy.py
+++ b/youtube_dl/extractor/microsoftvirtualacademy.py
@ -0,0 +1,192 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import (
    compat_xpath,
 )
 from ..utils import (
    int_or_none,
    parse_duration,
    smuggle_url,
    unsmuggle_url,
    xpath_text,
 )
 class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
    def _extract_base_url(self, course_id, display_id):
        """Download the JSON document for course_id from the products API.

        The decoded response is returned as is; callers in this file use it
        as the base URL for the course's content. display_id is passed to
        the download helper along with the progress note.
        """
        api_url = 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id
        return self._download_json(
            api_url, display_id, 'Downloading course base URL')
    def _extract_chapter_and_title(self, title):
        if not title:
            return None, None
        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
        return (int(m.group('chapter')), m.group('title')) if m else (None, title)
 class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
    IE_NAME = 'mva'
    IE_DESC = 'Microsoft Virtual Academy videos'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
        'md5': '7826c44fc31678b12ad8db11f6b5abb9',
        'info_dict': {
            'id': 'gfVXISmEB_6804984382',
            'ext': 'mp4',
            'title': 'Course Introduction',
            'formats': 'mincount:3',
            'subtitles': {
                'en': [{
                    'ext': 'ttml',
                }],
            },
        }
    }, {
        'url': 'mva:11788:gfVXISmEB_6804984382',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        """Extract one course video: title, progressive formats and subtitles.

        The base URL comes from smuggled data when present (set by the course
        extractor), otherwise from _extract_base_url. Formats and subtitles
        are read from the video's videosettings.xml.
        """
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = re.match(self._VALID_URL, url)
        course_id, video_id = mobj.group('course_id', 'id')

        base_url = smuggled_data.get('base_url')
        if not base_url:
            base_url = self._extract_base_url(course_id, video_id)

        settings_url = '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id)
        settings = self._download_xml(
            settings_url, video_id, 'Downloading video settings XML')

        raw_title = xpath_text(settings, './/Title', 'title', fatal=True)
        # Only the title part is needed here; the chapter number is dropped
        title = self._extract_chapter_and_title(raw_title)[1]

        formats = []
        for source_group in settings.findall(compat_xpath('.//MediaSources')):
            if source_group.get('videoType') == 'smoothstreaming':
                continue
            for media_source in source_group.findall(compat_xpath('./MediaSource')):
                media_url = media_source.text
                if not (media_url and media_url.startswith('http')):
                    continue

                mode = media_source.get('videoMode')
                # videoMode values such as '720p' carry the frame height
                height = int_or_none(self._search_regex(
                    r'^(\d+)[pP]$', mode or '', 'height', default=None))

                # A two-item codec attribute is unpacked as (audio, video);
                # a single item is used as the video codec
                audio_codec = video_codec = None
                codec_attr = media_source.get('codec')
                codec_parts = codec_attr.split(',') if codec_attr else []
                if len(codec_parts) == 2:
                    audio_codec, video_codec = codec_parts
                elif len(codec_parts) == 1:
                    video_codec = codec_parts[0]

                formats.append({
                    'url': media_url,
                    'format_id': mode,
                    'height': height,
                    'acodec': audio_codec,
                    'vcodec': video_codec,
                })
        self._sort_formats(formats)

        subtitles = {}
        for marker in settings.findall(compat_xpath('.//MarkerResourceSource')):
            relative_url = marker.text
            if not relative_url:
                continue
            # Every marker resource is filed under 'en'
            subtitles.setdefault('en', []).append({
                'url': '%s/%s' % (base_url, relative_url),
                'ext': marker.get('type'),
            })

        return {
            'id': video_id,
            'title': title,
            'subtitles': subtitles,
            'formats': formats
        }
 class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
    IE_NAME = 'mva:course'
    IE_DESC = 'Microsoft Virtual Academy courses'
    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
    _TESTS = [{
        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'info_dict': {
            'id': '11788',
            'title': 'Microsoft Azure Fundamentals: Virtual Machines',
        },
        'playlist_count': 36,
    }, {
        # with emphasized chapters
        'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
        'info_dict': {
            'id': '16335',
            'title': 'Developing Windows 10 Games with Construct 2',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
        'only_matching': True,
    }, {
        'url': 'mva:course:11788',
        'only_matching': True,
    }]
    @classmethod
    def suitable(cls, url):
        """Accept url unless the single-video extractor already claims it."""
        # Video URLs start with the same course prefix, so they are handed
        # to MicrosoftVirtualAcademyIE first
        if MicrosoftVirtualAcademyIE.suitable(url):
            return False
        return super(MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
    def _real_extract(self, url):
        """Extract a course as a playlist of its video lessons.

        The course manifest is walked chapter by chapter; every item whose
        metadata marks it as a 'Video' becomes a url_transparent entry that
        points at the single-video extractor through an 'mva:' URL, with the
        base URL smuggled along.
        """
        mobj = re.match(self._VALID_URL, url)
        course_id, display_id = mobj.group('id', 'display_id')

        base_url = self._extract_base_url(course_id, display_id)

        manifest_url = '%s/imsmanifestlite.json' % base_url
        manifest = self._download_json(
            manifest_url, display_id,
            'Downloading course manifest JSON')['manifest']

        organization = manifest['organizations']['organization'][0]

        entries = []
        for chapter in organization['item']:
            chapter_number, chapter_title = self._extract_chapter_and_title(
                chapter.get('title'))
            chapter_id = chapter.get('@identifier')
            for lesson in chapter.get('item', []):
                lesson_id = lesson.get('@identifier')
                if not lesson_id:
                    continue
                metadata = lesson.get('resource', {}).get('metadata') or {}
                # Only video items are kept; other resource types are skipped
                if metadata.get('learningresourcetype') != 'Video':
                    continue
                lesson_title = self._extract_chapter_and_title(
                    lesson.get('title'))[1]
                lesson_duration = parse_duration(metadata.get('duration'))
                lesson_description = metadata.get('description')
                lesson_url = smuggle_url(
                    'mva:%s:%s' % (course_id, lesson_id),
                    {'base_url': base_url})
                entries.append({
                    '_type': 'url_transparent',
                    'url': lesson_url,
                    'title': lesson_title,
                    'description': lesson_description,
                    'duration': lesson_duration,
                    'chapter': chapter_title,
                    'chapter_number': chapter_number,
                    'chapter_id': chapter_id,
                })

        # Fall back to the manifest-level title when the organization has none
        title = organization.get('title')
        if not title:
            title = manifest.get('metadata', {}).get('title')

        return self.playlist_result(entries, course_id, title)
--- a/youtube_dl/extractor/telegraaf.py
+++ b/youtube_dl/extractor/telegraaf.py
@ -2,14 +2,16 @@
 from __future__ import unicode_literals
 from .common import InfoExtractor
-from ..utils import remove_end
+from ..utils import (
    determine_ext,
    remove_end,
 )
 class TelegraafIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
    _TEST = {
        'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
        'md5': '83245a9779bcc4a24454bfd53c65b6dc',
        'info_dict': {
            'id': '24353229',
            'ext': 'mp4',
@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor):
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 33,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }
    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
+        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
+        webpage = self._download_webpage(url, video_id)
        player_url = self._html_search_regex(
            r'<iframe[^>]+src="([^"]+")', webpage, 'player URL')
        player_page = self._download_webpage(
            player_url, video_id, note='Download player webpage')
        playlist_url = self._search_regex(
-            r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+            r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
        playlist_data = self._download_json(playlist_url, video_id)
        item = playlist_data['items'][0]
        formats = []
        locations = item['locations']
        for location in locations.get('adaptive', []):
            manifest_url = location['src']
            ext = determine_ext(manifest_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, ext='mp4', m3u8_id='hls'))
            elif ext == 'mpd':
                # TODO: Current DASH formats are broken - $Time$ pattern in
                # <SegmentTemplate> not implemented yet
                continue
            else:
                self.report_warning('Unknown adaptive format %s' % ext)
        for location in locations.get('progressive', []):
            formats.append({
                'url': location['sources'][0]['src'],
                'width': location.get('width'),
                'height': location.get('height'),
                'format_id': 'http-%s' % location['label'],
            })
        self._sort_formats(formats)
        entries = self._extract_xspf_playlist(playlist_url, playlist_id)
        title = remove_end(self._og_search_title(webpage), ' - VIDEO')
        description = self._og_search_description(webpage)
        duration = item.get('duration')
        thumbnail = item.get('poster')
-        return self.playlist_result(entries, playlist_id, title, description)
+        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'duration': duration,
            'thumbnail': thumbnail,
        }