Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Uri London 2016-05-09 07:41:00 +03:00
commit 0a8215677a
10 changed files with 459 additions and 25 deletions

View File

@ -169,3 +169,5 @@ Viťas Strádal
Kagami Hiiragi Kagami Hiiragi
Philip Huppert Philip Huppert
blahgeek blahgeek
Kevin Deldycke
inondle

View File

@ -2025,6 +2025,7 @@ class YoutubeDL(object):
if opts_cookiefile is None: if opts_cookiefile is None:
self.cookiejar = compat_cookiejar.CookieJar() self.cookiejar = compat_cookiejar.CookieJar()
else: else:
opts_cookiefile = compat_expanduser(opts_cookiefile)
self.cookiejar = compat_cookiejar.MozillaCookieJar( self.cookiejar = compat_cookiejar.MozillaCookieJar(
opts_cookiefile) opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK): if os.access(opts_cookiefile, os.R_OK):

View File

@ -86,7 +86,9 @@ def _real_main(argv=None):
if opts.batchfile == '-': if opts.batchfile == '-':
batchfd = sys.stdin batchfd = sys.stdin
else: else:
batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') batchfd = io.open(
compat_expanduser(opts.batchfile),
'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd) batch_urls = read_batch_urls(batchfd)
if opts.verbose: if opts.verbose:
write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
@ -404,7 +406,7 @@ def _real_main(argv=None):
try: try:
if opts.load_info_filename is not None: if opts.load_info_filename is not None:
retcode = ydl.download_with_info_file(opts.load_info_filename) retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))
else: else:
retcode = ydl.download(all_urls) retcode = ydl.download(all_urls)
except MaxDownloadsReached: except MaxDownloadsReached:

View File

@ -224,7 +224,7 @@ class FFmpegFD(ExternalFD):
args += ['-rtmp_live', 'live'] args += ['-rtmp_live', 'live']
args += ['-i', url, '-c', 'copy'] args += ['-i', url, '-c', 'copy']
if protocol == 'm3u8': if protocol in ('m3u8', 'm3u8_native'):
if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
args += ['-f', 'mpegts'] args += ['-f', 'mpegts']
else: else:

View File

@ -4,6 +4,7 @@ import os.path
import re import re
from .fragment import FragmentFD from .fragment import FragmentFD
from .external import FFmpegFD
from ..compat import compat_urlparse from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
@ -17,12 +18,34 @@ class HlsFD(FragmentFD):
FD_NAME = 'hlsnative' FD_NAME = 'hlsnative'
@staticmethod
def can_download(manifest):
UNSUPPORTED_FEATURES = (
r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1]
r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
# 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
)
return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
def real_download(self, filename, info_dict): def real_download(self, filename, info_dict):
man_url = info_dict['url'] man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
manifest = self.ydl.urlopen(man_url).read() manifest = self.ydl.urlopen(man_url).read()
s = manifest.decode('utf-8', 'ignore') s = manifest.decode('utf-8', 'ignore')
if not self.can_download(s):
self.report_warning(
'hlsnative has detected features it does not support, '
'extraction will be delegated to ffmpeg')
fd = FFmpegFD(self.ydl, self.params)
for ph in self._progress_hooks:
fd.add_progress_hook(ph)
return fd.real_download(filename, info_dict)
fragment_urls = [] fragment_urls = []
for line in s.splitlines(): for line in s.splitlines():
line = line.strip() line = line.strip()

View File

@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor):
'es': 'E[ESP]', 'es': 'E[ESP]',
} }
langcode = LANGS.get(lang, lang)
formats = [] formats = []
for format_id, format_dict in player_info['VSR'].items(): for format_id, format_dict in player_info['VSR'].items():
f = dict(format_dict) f = dict(format_dict)
versionCode = f.get('versionCode') versionCode = f.get('versionCode')
langcode = LANGS.get(lang, lang) l = re.escape(langcode)
lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
lang_pref = None # Language preference from most to least priority
if versionCode: # Reference: section 5.6.3 of
matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) PREFERENCES = (
source_pref = 0 # original version in requested language, without subtitles
if versionCode is not None: r'VO{0}$'.format(l),
# The original version with subtitles has lower relevance # original version in requested language, with partial subtitles in requested language
if re.match(r'VO-ST(F|A|E)', versionCode): r'VO{0}-ST{0}$'.format(l),
source_pref -= 10 # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
# The version with sourds/mal subtitles has also lower relevance r'VO{0}-STM{0}$'.format(l),
elif re.match(r'VO?(F|A|E)-STM\1', versionCode): # non-original (dubbed) version in requested language, without subtitles
source_pref -= 9 r'V{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
r'V{0}-ST{0}$'.format(l),
# non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
r'V{0}-STM{0}$'.format(l),
# original version in requested language, with partial subtitles in different language
r'VO{0}-ST(?!{0}).+?$'.format(l),
# original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
r'VO{0}-STM(?!{0}).+?$'.format(l),
# original version in different language, with partial subtitles in requested language
r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
# original version in different language, without subtitles
r'VO(?:(?!{0}))?$'.format(l),
# original version in different language, with partial subtitles in different language
r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
# original version in different language, with subtitles for the deaf and hard-of-hearing in different language
r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
)
for pref, p in enumerate(PREFERENCES):
if re.match(p, versionCode):
lang_pref = len(PREFERENCES) - pref
break
else:
lang_pref = -1
format = { format = {
'format_id': format_id, 'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor):
'height': int_or_none(f.get('height')), 'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')), 'tbr': int_or_none(f.get('bitrate')),
'quality': qfunc(f.get('quality')), 'quality': qfunc(f.get('quality')),
'source_preference': source_pref,
} }
if f.get('mediaType') == 'rtmp': if f.get('mediaType') == 'rtmp':

View File

@ -384,6 +384,7 @@ from .limelight import (
LimelightChannelIE, LimelightChannelIE,
LimelightChannelListIE, LimelightChannelListIE,
) )
from .litv import LiTVIE
from .liveleak import LiveLeakIE from .liveleak import LiveLeakIE
from .livestream import ( from .livestream import (
LivestreamIE, LivestreamIE,
@ -408,6 +409,10 @@ from .metacafe import MetacafeIE
from .metacritic import MetacriticIE from .metacritic import MetacriticIE
from .mgoon import MgoonIE from .mgoon import MgoonIE
from .mgtv import MGTVIE from .mgtv import MGTVIE
from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
)
from .minhateca import MinhatecaIE from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE from .ministrygrid import MinistryGridIE
from .minoto import MinotoIE from .minoto import MinotoIE

View File

@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
smuggle_url,
unsmuggle_url,
)
class LiTVIE(InfoExtractor):
_VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
_URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
_TESTS = [{
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'info_dict': {
'id': 'VOD00041606',
'title': '花千骨',
},
'playlist_count': 50,
}, {
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'info_dict': {
'id': 'VOD00041610',
'ext': 'mp4',
'title': '花千骨第1集',
'thumbnail': 're:https?://.*\.jpg$',
'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
'episode_number': 1,
},
'params': {
'noplaylist': True,
'skip_download': True, # m3u8 download
},
'skip': 'Georestricted to Taiwan',
}]
def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True):
episode_title = view_data['title']
content_id = season_list['contentId']
if prompt:
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
all_episodes = [
self.url_result(smuggle_url(
self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']),
{'force_noplaylist': True})) # To prevent infinite recursion
for episode in season_list['episode']]
return self.playlist_result(all_episodes, content_id, episode_title)
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
video_id = self._match_id(url)
noplaylist = self._downloader.params.get('noplaylist')
noplaylist_prompt = True
if 'force_noplaylist' in data:
noplaylist = data['force_noplaylist']
noplaylist_prompt = False
webpage = self._download_webpage(url, video_id)
view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
webpage)))
vod_data = self._parse_json(self._search_regex(
'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
video_id)
season_list = list(vod_data.get('seasonList', {}).values())
if season_list:
if not noplaylist:
return self._extract_playlist(
season_list[0], video_id, vod_data, view_data,
prompt=noplaylist_prompt)
if noplaylist_prompt:
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
# In browsers `getMainUrl` request is always issued. Usually this
# endpoint gives the same result as the data embedded in the webpage.
# If georestricted, there are no embedded data, so an extra request is
# necessary to get the error code
video_data = self._parse_json(self._search_regex(
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id)
if not video_data:
payload = {
'assetId': view_data['assetId'],
'watchDevices': vod_data['watchDevices'],
'contentType': view_data['contentType'],
}
video_data = self._download_json(
'https://www.litv.tv/vod/getMainUrl', video_id,
data=json.dumps(payload).encode('utf-8'),
headers={'Content-Type': 'application/json'})
if not video_data.get('fullpath'):
error_msg = video_data.get('errorMessage')
if error_msg == 'vod.error.outsideregionerror':
self.raise_geo_restricted('This video is available in Taiwan only')
if error_msg:
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
formats = self._extract_m3u8_formats(
video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
for a_format in formats:
# LiTV HLS segments doesn't like compressions
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
title = view_data['title'] + view_data.get('secondaryMark', '')
description = view_data.get('description')
thumbnail = view_data.get('imageFile')
categories = [item['name'] for item in vod_data.get('category', [])]
episode = int_or_none(view_data.get('episode'))
return {
'id': video_id,
'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,
'categories': categories,
'episode_number': episode,
}

View File

@ -0,0 +1,192 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_xpath,
)
from ..utils import (
int_or_none,
parse_duration,
smuggle_url,
unsmuggle_url,
xpath_text,
)
class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
def _extract_base_url(self, course_id, display_id):
return self._download_json(
'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
display_id, 'Downloading course base URL')
def _extract_chapter_and_title(self, title):
if not title:
return None, None
m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
return (int(m.group('chapter')), m.group('title')) if m else (None, title)
class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
IE_NAME = 'mva'
IE_DESC = 'Microsoft Virtual Academy videos'
_VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
_TESTS = [{
'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
'md5': '7826c44fc31678b12ad8db11f6b5abb9',
'info_dict': {
'id': 'gfVXISmEB_6804984382',
'ext': 'mp4',
'title': 'Course Introduction',
'formats': 'mincount:3',
'subtitles': {
'en': [{
'ext': 'ttml',
}],
},
}
}, {
'url': 'mva:11788:gfVXISmEB_6804984382',
'only_matching': True,
}]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
course_id = mobj.group('course_id')
video_id = mobj.group('id')
base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
settings = self._download_xml(
'%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
video_id, 'Downloading video settings XML')
_, title = self._extract_chapter_and_title(xpath_text(
settings, './/Title', 'title', fatal=True))
formats = []
for sources in settings.findall(compat_xpath('.//MediaSources')):
if sources.get('videoType') == 'smoothstreaming':
continue
for source in sources.findall(compat_xpath('./MediaSource')):
video_url = source.text
if not video_url or not video_url.startswith('http'):
continue
video_mode = source.get('videoMode')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
codec = source.get('codec')
acodec, vcodec = [None] * 2
if codec:
codecs = codec.split(',')
if len(codecs) == 2:
acodec, vcodec = codecs
elif len(codecs) == 1:
vcodec = codecs[0]
formats.append({
'url': video_url,
'format_id': video_mode,
'height': height,
'acodec': acodec,
'vcodec': vcodec,
})
self._sort_formats(formats)
subtitles = {}
for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
subtitle_url = source.text
if not subtitle_url:
continue
subtitles.setdefault('en', []).append({
'url': '%s/%s' % (base_url, subtitle_url),
'ext': source.get('type'),
})
return {
'id': video_id,
'title': title,
'subtitles': subtitles,
'formats': formats
}
class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
IE_NAME = 'mva:course'
IE_DESC = 'Microsoft Virtual Academy courses'
_VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
_TESTS = [{
'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
'info_dict': {
'id': '11788',
'title': 'Microsoft Azure Fundamentals: Virtual Machines',
},
'playlist_count': 36,
}, {
# with emphasized chapters
'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
'info_dict': {
'id': '16335',
'title': 'Developing Windows 10 Games with Construct 2',
},
'playlist_count': 10,
}, {
'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
'only_matching': True,
}, {
'url': 'mva:course:11788',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
course_id = mobj.group('id')
display_id = mobj.group('display_id')
base_url = self._extract_base_url(course_id, display_id)
manifest = self._download_json(
'%s/imsmanifestlite.json' % base_url,
display_id, 'Downloading course manifest JSON')['manifest']
organization = manifest['organizations']['organization'][0]
entries = []
for chapter in organization['item']:
chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
chapter_id = chapter.get('@identifier')
for item in chapter.get('item', []):
item_id = item.get('@identifier')
if not item_id:
continue
metadata = item.get('resource', {}).get('metadata') or {}
if metadata.get('learningresourcetype') != 'Video':
continue
_, title = self._extract_chapter_and_title(item.get('title'))
duration = parse_duration(metadata.get('duration'))
description = metadata.get('description')
entries.append({
'_type': 'url_transparent',
'url': smuggle_url(
'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
'title': title,
'description': description,
'duration': duration,
'chapter': chapter_title,
'chapter_number': chapter_number,
'chapter_id': chapter_id,
})
title = organization.get('title') or manifest.get('metadata', {}).get('title')
return self.playlist_result(entries, course_id, title)

View File

@ -2,14 +2,16 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import remove_end from ..utils import (
determine_ext,
remove_end,
)
class TelegraafIE(InfoExtractor): class TelegraafIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
_TEST = { _TEST = {
'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
'md5': '83245a9779bcc4a24454bfd53c65b6dc',
'info_dict': { 'info_dict': {
'id': '24353229', 'id': '24353229',
'ext': 'mp4', 'ext': 'mp4',
@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$', 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 33, 'duration': 33,
}, },
'params': {
# m3u8 download
'skip_download': True,
},
} }
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, video_id)
player_url = self._html_search_regex(
r'<iframe[^>]+src="([^"]+")', webpage, 'player URL')
player_page = self._download_webpage(
player_url, video_id, note='Download player webpage')
playlist_url = self._search_regex( playlist_url = self._search_regex(
r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
playlist_data = self._download_json(playlist_url, video_id)
item = playlist_data['items'][0]
formats = []
locations = item['locations']
for location in locations.get('adaptive', []):
manifest_url = location['src']
ext = determine_ext(manifest_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
manifest_url, video_id, ext='mp4', m3u8_id='hls'))
elif ext == 'mpd':
# TODO: Current DASH formats are broken - $Time$ pattern in
# <SegmentTemplate> not implemented yet
continue
else:
self.report_warning('Unknown adaptive format %s' % ext)
for location in locations.get('progressive', []):
formats.append({
'url': location['sources'][0]['src'],
'width': location.get('width'),
'height': location.get('height'),
'format_id': 'http-%s' % location['label'],
})
self._sort_formats(formats)
entries = self._extract_xspf_playlist(playlist_url, playlist_id)
title = remove_end(self._og_search_title(webpage), ' - VIDEO') title = remove_end(self._og_search_title(webpage), ' - VIDEO')
description = self._og_search_description(webpage) description = self._og_search_description(webpage)
duration = item.get('duration')
thumbnail = item.get('poster')
return self.playlist_result(entries, playlist_id, title, description) return {
'id': video_id,
'title': title,
'description': description,
'formats': formats,
'duration': duration,
'thumbnail': thumbnail,
}