From 0d66bd0eab436f7215f5da168b378127898ccd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 13:56:51 +0600 Subject: [PATCH 01/11] [downloader/hls] Delegate extraction to ffmpeg when unsupported features detected --- youtube_dl/downloader/hls.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a01dac031..d7b34bde3 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,6 +4,7 @@ import os.path import re from .fragment import FragmentFD +from .external import FFmpegFD from ..compat import compat_urlparse from ..utils import ( @@ -17,12 +18,34 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + @staticmethod + def can_download(manifest): + UNSUPPORTED_FEATURES = ( + r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + ) + return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() s = manifest.decode('utf-8', 'ignore') + + if not self.can_download(s): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + fragment_urls = [] for line in s.splitlines(): line = line.strip() From a0904c5d8024c12b7f95b1126a6b8152a4e1021f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 8 May 2016 00:56:31 +0800 Subject: [PATCH 02/11] [telegraaf] Fix extractor (closes #9318) --- youtube_dl/extractor/telegraaf.py | 58 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r']+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # not implemented yet + continue + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } From e2eca6f65e9969c31b3374bd3688321f3e471cd7 Mon Sep 17 00:00:00 2001 From: Kevin Deldycke Date: Sat, 7 May 2016 20:03:25 +0200 Subject: [PATCH 03/11] Expand user's home in batch file path. --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..7a0466077 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -86,7 +86,9 @@ def _real_main(argv=None): if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + compat_expanduser(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') From 00c21c225decf648199013f2fa3385a1332037bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 00:11:44 +0600 Subject: [PATCH 04/11] Credit @kdeldycke for #9430 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 814fe9ec3..5f668338b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -169,3 +169,4 @@ Viťas Strádal Kagami Hiiragi Philip Huppert blahgeek +Kevin Deldycke From 5c24873a9e6a47e58b10eb0c0825e165604796f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 02:04:34 +0600 Subject: [PATCH 05/11] Credit @inondle for #9400 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 5f668338b..bf860b7f7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -170,3 +170,4 @@ Kagami Hiiragi Philip Huppert blahgeek Kevin Deldycke +inondle From f5436c5d9e4e65790440ada40476712ff430651b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 02:29:26 +0600 Subject: [PATCH 06/11] [downloader/external] Add temp fix ffmpeg m3u8 downloads (Closes #9394) --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 8d642fc3e..45f49c350 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -224,7 +224,7 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] - if protocol == 'm3u8': + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: From 3e169233daf76cd7585ebac12504f8e624b7693b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 04:36:57 +0600 Subject: [PATCH 07/11] Expanduser for more options with input files --- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2187dcc8f..a96482e68 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2018,6 +2018,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: + opts_cookiefile = compat_expanduser(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7a0466077..cbd84c3af 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -406,7 +406,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: From 9c072d38c6b0361d91e92c50cd0c753dc8ce3101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 06:52:42 +0600 Subject: [PATCH 08/11] [arte] Improve language preference (Closes #9401, closes #9162) --- youtube_dl/extractor/arte.py | 58 ++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..e37fdae13 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref = None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': From 3b01a9fbb63e33325fa979db8a846d3e655e79e6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 8 May 2016 14:34:38 +0800 Subject: [PATCH 09/11] [litv] Add new extractor LiTV is a streaming platform providing free and paid legal contents in Taiwan. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/litv.py | 137 +++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 youtube_dl/extractor/litv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 14b4f245f..7bacef184 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -384,6 +384,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): + episode_title = view_data['title'] + content_id = season_list['contentId'] + + if prompt: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self._downloader.params.get('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + view_data = dict(map(lambda t: (t[0], t[2]), re.findall( + r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', + webpage))) + + vod_data = self._parse_json(self._search_regex( + 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(vod_data.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, vod_data, view_data, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. + # If georestricted, there are no embedded data, so an extra request is + # necessary to get the error code + video_data = self._parse_json(self._search_regex( + r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', + webpage, 'video data', default='{}'), video_id) + if not video_data: + payload = { + 'assetId': view_data['assetId'], + 'watchDevices': vod_data['watchDevices'], + 'contentType': view_data['contentType'], + } + video_data = self._download_json( + 'https://www.litv.tv/vod/getMainUrl', video_id, + data=json.dumps(payload).encode('utf-8'), + headers={'Content-Type': 'application/json'}) + + if not video_data.get('fullpath'): + error_msg = video_data.get('errorMessage') + if error_msg == 'vod.error.outsideregionerror': + self.raise_geo_restricted('This video is available in Taiwan only') + if error_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) + raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + + formats = self._extract_m3u8_formats( + video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + for a_format in formats: + # LiTV HLS segments doesn't like compressions + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + + title = view_data['title'] + view_data.get('secondaryMark', '') + description = view_data.get('description') + thumbnail = view_data.get('imageFile') + categories = [item['name'] for item in vod_data.get('category', [])] + episode = int_or_none(view_data.get('episode')) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'episode_number': episode, + } From f23a92a0cecac0d4db60e086e429793556347271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 20:02:54 +0600 Subject: [PATCH 10/11] [mva] Add extractor (Closes #6667) --- youtube_dl/extractor/extractors.py | 4 + .../extractor/microsoftvirtualacademy.py | 192 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 youtube_dl/extractor/microsoftvirtualacademy.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7bacef184..a0bb3d4c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -409,6 +409,10 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..b7fea47ee --- /dev/null +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -0,0 +1,192 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P\d+)\s*\|\s*(?P.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + if sources.get('videoType') == 'smoothstreaming': + continue + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' % (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) From c52f4efaee2386a72c3f6b694fb4f4c3132ced55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 20:10:20 +0600 Subject: [PATCH 11/11] [mva] Improve _VALID_URLs --- youtube_dl/extractor/microsoftvirtualacademy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py index b7fea47ee..afd3e98ec 100644 --- a/youtube_dl/extractor/microsoftvirtualacademy.py +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -31,7 +31,7 @@ class MicrosoftVirtualAcademyBaseIE(InfoExtractor): class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva' IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME _TESTS = [{ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', @@ -118,7 +118,7 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva:course' IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME _TESTS = [{ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',