From 78790ba83f1736c196f09b55d076e798a175573c Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Thu, 26 Apr 2018 20:33:09 +0100 Subject: [PATCH 01/19] First commit --- youtube_dl/extractor/extractors.py | 3 + youtube_dl/extractor/frontendmaster.py | 88 ++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 youtube_dl/extractor/frontendmaster.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3570fa165..d6ecb2d6f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -384,6 +384,7 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE +from .frontendmaster import FrontEndMasterIE from .funimation import FunimationIE from .funk import ( FunkMixIE, @@ -1421,3 +1422,5 @@ from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE + + diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py new file mode 100644 index 000000000..6a5235e4b --- /dev/null +++ b/youtube_dl/extractor/frontendmaster.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from youtube_dl.utils import try_get +from .common import InfoExtractor +from ..compat import ( + # compat_str, + compat_urlparse, + compat_str) + +class FrontEndMasterBaseIE(InfoExtractor): + _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' + + _supported_resolutions = { + 'low': 360, + 'mid': 720, + 'high': 1080 + } + + _supported_formats = ['mp4', 'webm'] + + def _match_course_id(self, url): + if '_VALID_URL_RE' not in self.__dict__: + self._VALID_URL_RE = re.compile(self._VALID_URL) + m = self._VALID_URL_RE.match(url) + assert m + return compat_str(m.group('courseid')) + + def _download_course(self, course_id, url, display_id): + response = self._download_json( + '%s/%s' % (self._API_BASE, course_id), course_id, + 'Downloading course JSON', + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + return response + + +class FrontEndMasterIE(FrontEndMasterBaseIE): + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' + _TEST = { + 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': 'introduction', + 'courseid': 'content-strategy', + 'ext': 'webm', + 'title': 'Introduction', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + course_id = self._match_course_id(url) + json_content = self._download_course(course_id=course_id, url=url, display_id=None) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... + lesson_index = json_content['lessonSlugs'].index(video_id) + lesson_hash = json_content['lessonHashes'][lesson_index] + lesson_data = json_content['lessonData'][lesson_hash] + lesson_source_base = lesson_data['sourceBase'] + + video_url_request = "%s/source?r=360&f=mp4" + + + title = lesson_data['title'] + description = json_content['description'] + + # title = self._html_search_regex(r'
(.+?)
', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'description': description, + + # TODO more properties (see youtube_dl/extractor/common.py) + } \ No newline at end of file From a4619f74ae193d40f35adb4188176f0668d7ba61 Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Sat, 28 Apr 2018 19:47:50 +0100 Subject: [PATCH 02/19] First version of FrontendMasters downloader actually working --- youtube_dl/extractor/frontendmaster.py | 84 +++++++++++++++++++++----- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 6a5235e4b..4fb84be98 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -1,7 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import sys + import re from youtube_dl.utils import try_get @@ -11,8 +12,15 @@ from ..compat import ( compat_urlparse, compat_str) +from ..utils import ( + ExtractorError, + urlencode_postdata +) + + class FrontEndMasterBaseIE(InfoExtractor): _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' + _COOKIES_BASE = 'https://api.frontendmasters.com' _supported_resolutions = { 'low': 360, @@ -41,29 +49,60 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters' _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' + _LOGIN_URL = 'https://frontendmasters.com/login/' + _NETRC_MACHINE = 'frontend-masters' _TEST = { 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': 'introduction', 'courseid': 'content-strategy', - 'ext': 'webm', - 'title': 'Introduction', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'ext': 'mp4', + 'title': 'Introduction' } } + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + "username": username, + "password": password + }) + + post_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, + 'post_url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'} + ) + + logout_link = self._search_regex('(Logout .*)', response, 'logout-link') + if not logout_link: + raise ExtractorError('Unable to login', expected=True) + def _real_extract(self, url): video_id = self._match_id(url) course_id = self._match_course_id(url) json_content = self._download_course(course_id=course_id, url=url, display_id=None) - webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... lesson_index = json_content['lessonSlugs'].index(video_id) @@ -71,18 +110,31 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_data = json_content['lessonData'][lesson_hash] lesson_source_base = lesson_data['sourceBase'] - video_url_request = "%s/source?r=360&f=mp4" + cookies = self._get_cookies(self._COOKIES_BASE) + cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) for cookie in cookies.values()]) + video_request_url = "%s/source" + video_request_params = { + 'r': 720, + 'f': 'mp4' + } + video_request_headers = { + "origin": "https://frontendmasters.com", + "referer": lesson_source_base, + "cookie": cookies_str, + 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36" + } + video_response = self._download_json(video_request_url % lesson_source_base, video_id, query=video_request_params, headers=video_request_headers) title = lesson_data['title'] description = json_content['description'] - - # title = self._html_search_regex(r'
(.+?)
', webpage, 'title') + video_url = video_response['url'] return { 'id': video_id, 'title': title, - 'description': description, + 'courseid': course_id, + 'url': video_url # TODO more properties (see youtube_dl/extractor/common.py) - } \ No newline at end of file + } From 540516c2e683c0dca461043cf845d18224af91a8 Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 17:54:49 +0100 Subject: [PATCH 03/19] Add course download and a bunch of metadata --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/frontendmaster.py | 251 +++++++++++++++++++------ 2 files changed, 196 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d6ecb2d6f..4e191d78d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -384,7 +384,10 @@ from .francetv import ( from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE -from .frontendmaster import FrontEndMasterIE +from .frontendmaster import ( + FrontEndMasterIE, + FrontEndMasterCourseIE +) from .funimation import FunimationIE from .funk import ( FunkMixIE, diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 4fb84be98..30dcc75ef 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import collections +import random import sys import re @@ -14,54 +16,19 @@ from ..compat import ( from ..utils import ( ExtractorError, - urlencode_postdata + urlencode_postdata, + qualities ) class FrontEndMasterBaseIE(InfoExtractor): _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' + _VIDEO_BASE = 'http://www.frontendmasters.com/courses' _COOKIES_BASE = 'https://api.frontendmasters.com' - - _supported_resolutions = { - 'low': 360, - 'mid': 720, - 'high': 1080 - } - - _supported_formats = ['mp4', 'webm'] - - def _match_course_id(self, url): - if '_VALID_URL_RE' not in self.__dict__: - self._VALID_URL_RE = re.compile(self._VALID_URL) - m = self._VALID_URL_RE.match(url) - assert m - return compat_str(m.group('courseid')) - - def _download_course(self, course_id, url, display_id): - response = self._download_json( - '%s/%s' % (self._API_BASE, course_id), course_id, - 'Downloading course JSON', - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }) - return response - - -class FrontEndMasterIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' _LOGIN_URL = 'https://frontendmasters.com/login/' - _NETRC_MACHINE = 'frontend-masters' - _TEST = { - 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', - # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', - 'info_dict': { - 'id': 'introduction', - 'courseid': 'content-strategy', - 'ext': 'mp4', - 'title': 'Introduction' - } + _SUPPORTED_MEAN = { + "resolution": [360, 720, 1080], + "format": ['webm', 'mp4'] } def _real_initialize(self): @@ -99,24 +66,102 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): if not logout_link: raise ExtractorError('Unable to login', expected=True) + def _match_course_id(self, url): + if '_VALID_URL_RE' not in self.__dict__: + self._VALID_URL_RE = re.compile(self._VALID_URL) + m = self._VALID_URL_RE.match(url) + assert m + return compat_str(m.group('courseid')) + + def _download_course(self, course_id, url, display_id): + response = self._download_json( + '%s/%s' % (self._API_BASE, course_id), course_id, + 'Downloading course JSON', + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + return response + + def _pair_section_with_video_elemen_index(self, lesson_elements): + sections = {} + current_section = None + current_section_number = 0 + for elem in lesson_elements: + if isinstance(elem, unicode): + (current_section, current_section_number) = (elem.encode('utf-8'), current_section_number + 1) + else: + if current_section: + sections[elem] = (current_section, current_section_number) + + return sections + + +class FrontEndMasterIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters' + + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' + _NETRC_MACHINE = 'frontend-masters' + _TEST = { + 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', + # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': 'introduction', + 'title': 'Introduction', + 'display_id': 'content-strategy', + 'ext': 'mp4' + } + } + def _real_extract(self, url): video_id = self._match_id(url) course_id = self._match_course_id(url) - json_content = self._download_course(course_id=course_id, url=url, display_id=None) + course_json_content = self._download_course(course_id=course_id, url=url, display_id=course_id) + + # Course details + # course_name = course_json_content.get('title') + # course_description = course_json_content.get('description') + # course_display_id = course_json_content.get('slug') + # course_thumbnail = course_json_content.get('thumbnail') # TODO more code goes here, for example ... - lesson_index = json_content['lessonSlugs'].index(video_id) - lesson_hash = json_content['lessonHashes'][lesson_index] - lesson_data = json_content['lessonData'][lesson_hash] - lesson_source_base = lesson_data['sourceBase'] + lesson_index = course_json_content.get('lessonSlugs').index(video_id) + lesson_hash = course_json_content.get('lessonHashes')[lesson_index] + lesson_section_elements = course_json_content.get('lessonElements') + lesson_data = course_json_content.get('lessonData')[lesson_hash] + lesson_source_base = lesson_data.get('sourceBase') + course_sections_pairing = self._pair_section_with_video_elemen_index(lesson_section_elements) + + lesson_title = lesson_data.get('title') + lesson_description = lesson_data.get('description') + lesson_index = lesson_data.get('index') + lesson_slug = lesson_data.get('slug') + lesson_thumbnail_url = lesson_data.get('thumbnail') + # lesson_element_index = lesson_data.get('elementIndex') + lesson_section = course_sections_pairing.get(lesson_index)[0] + lesson_section_number = course_sections_pairing.get(lesson_index)[1] + + # Get instructors informations + # instructors = course_json_content.get('instructors') + # authors = "; ".join([author.name for author in instructors]) + + QUALITIES_PREFERENCE = ('low', 'medium', 'high') + quality_key = qualities(QUALITIES_PREFERENCE) + QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'medium': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + ALLOWED_QUALITIES = [ + AllowedQuality('webm', ['low', 'medium', 'high']), + AllowedQuality('mp4', ['low', 'medium', 'high']) + ] cookies = self._get_cookies(self._COOKIES_BASE) cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) for cookie in cookies.values()]) video_request_url = "%s/source" - video_request_params = { - 'r': 720, - 'f': 'mp4' - } video_request_headers = { "origin": "https://frontendmasters.com", "referer": lesson_source_base, @@ -124,17 +169,105 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36" } - video_response = self._download_json(video_request_url % lesson_source_base, video_id, query=video_request_params, headers=video_request_headers) + if self._downloader.params.get('listformats', False): + allowed_qualities = ALLOWED_QUALITIES + else: + def guess_allowed_qualities(): + req_format = self._downloader.params.get('format') or 'best' + req_format_split = req_format.split('-', 1) + if len(req_format_split) > 1: + req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) + for allowed_quality in ALLOWED_QUALITIES: + if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + return (AllowedQuality(req_ext, (req_quality, )), ) + req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, ('high', )), ) + allowed_qualities = guess_allowed_qualities() - title = lesson_data['title'] - description = json_content['description'] - video_url = video_response['url'] + formats = [] + for ext, qualities_ in allowed_qualities: + for quality in qualities_: + f = QUALITIES[quality].copy() + video_request_params = { + 'r': f['height'], + 'f': ext + } + video_response = self._download_json(video_request_url % lesson_source_base, video_id, + query=video_request_params, headers=video_request_headers) + + # To avoid the possibility of problems with multiple sequential calls to ViewClip API and start + # to return 429 HTTP errors after some time (see the problem Pluralsight has on + # https://github.com/rg3/youtube-dl/pull/6989) and avoid also the risk of + # account ban (see https://github.com/rg3/youtube-dl/issues/6842), + # we will sleep random amount of time before each call to ViewClip. + + # self._sleep( + # random.randint(2, 5), lesson_slug, + # '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') + # + # if not video_response: + # continue + + video_url = video_response.get('url') + clip_f = f.copy() + clip_f.update({ + 'url': video_url, + 'ext': ext, + 'format_id': '%s-%s' % (ext, quality), + 'quality': quality_key(quality), + 'height': f['height'] + }) + formats.append(clip_f) + + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'courseid': course_id, - 'url': video_url - - # TODO more properties (see youtube_dl/extractor/common.py) + 'display_id': lesson_slug, + 'title': lesson_title, + 'description': lesson_description, + 'chapter': lesson_section, + 'chapter_number': lesson_section_number, + 'thumbnail': lesson_thumbnail_url, + 'formats': formats } + + +class FrontEndMasterCourseIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters:course' + IE_DESC = "frontendmasters.com online courses" + + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?' + _NETRC_MACHINE = 'frontend-masters' + _TEST = { + 'url': 'https://frontendmasters.com/courses/content-strategy/', + # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': 'content-strategy', + 'title': 'Content Strategy' + } + } + + def _real_extract(self, url): + course_id = self._match_id(url) + course_json_content = self._download_course(course_id=course_id, url=url, display_id=None) + + title = course_json_content.get('title') + description = course_json_content.get('description') + course_display_id = course_json_content.get('slug') + + videos_data = course_json_content.get('lessonData').values() + videos_data = sorted(videos_data, key=lambda video: video.get('index')) + + entries = [] + for video in videos_data: + video_slug = video.get('slug') + clip_url = "%s/%s/%s" % (self._VIDEO_BASE, course_display_id, video_slug) + entries.append({ + '_type': 'url_transparent', + 'url': clip_url, + 'ie_key': FrontEndMasterIE.ie_key() + }) + + return self.playlist_result(entries, course_id, title, description) From 52e17c442b23c13825183202a36043770a9866b8 Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 18:40:06 +0100 Subject: [PATCH 04/19] Add tests for both the course and the single video file --- youtube_dl/extractor/frontendmaster.py | 58 +++++++++++++------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 30dcc75ef..58f12a26d 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -2,15 +2,11 @@ from __future__ import unicode_literals import collections -import random -import sys import re -from youtube_dl.utils import try_get from .common import InfoExtractor from ..compat import ( - # compat_str, compat_urlparse, compat_str) @@ -62,7 +58,8 @@ class FrontEndMasterBaseIE(InfoExtractor): headers={'Content-Type': 'application/x-www-form-urlencoded'} ) - logout_link = self._search_regex('(Logout .*)', response, 'logout-link') + logout_link = self._search_regex('(Logout .*)', + response, 'logout-link') if not logout_link: raise ExtractorError('Unable to login', expected=True) @@ -89,7 +86,8 @@ class FrontEndMasterBaseIE(InfoExtractor): current_section_number = 0 for elem in lesson_elements: if isinstance(elem, unicode): - (current_section, current_section_number) = (elem.encode('utf-8'), current_section_number + 1) + (current_section, current_section_number) = \ + (elem.encode('utf-8'), current_section_number + 1) else: if current_section: sections[elem] = (current_section, current_section_number) @@ -104,27 +102,23 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): _NETRC_MACHINE = 'frontend-masters' _TEST = { 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', - # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'md5': '5f176d4f170778524f40a06307a929f6', 'info_dict': { 'id': 'introduction', 'title': 'Introduction', 'display_id': 'content-strategy', 'ext': 'mp4' - } + }, + 'skip': 'Requires FrontendMasters account credentials' } def _real_extract(self, url): video_id = self._match_id(url) course_id = self._match_course_id(url) - course_json_content = self._download_course(course_id=course_id, url=url, display_id=course_id) + course_json_content = self._download_course(course_id=course_id, + url=url, + display_id=course_id) - # Course details - # course_name = course_json_content.get('title') - # course_description = course_json_content.get('description') - # course_display_id = course_json_content.get('slug') - # course_thumbnail = course_json_content.get('thumbnail') - - # TODO more code goes here, for example ... lesson_index = course_json_content.get('lessonSlugs').index(video_id) lesson_hash = course_json_content.get('lessonHashes')[lesson_index] lesson_section_elements = course_json_content.get('lessonElements') @@ -137,13 +131,9 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_index = lesson_data.get('index') lesson_slug = lesson_data.get('slug') lesson_thumbnail_url = lesson_data.get('thumbnail') - # lesson_element_index = lesson_data.get('elementIndex') lesson_section = course_sections_pairing.get(lesson_index)[0] lesson_section_number = course_sections_pairing.get(lesson_index)[1] - # Get instructors informations - # instructors = course_json_content.get('instructors') - # authors = "; ".join([author.name for author in instructors]) QUALITIES_PREFERENCE = ('low', 'medium', 'high') quality_key = qualities(QUALITIES_PREFERENCE) @@ -160,13 +150,16 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): ] cookies = self._get_cookies(self._COOKIES_BASE) - cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) for cookie in cookies.values()]) + cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) + for cookie in cookies.values()]) video_request_url = "%s/source" video_request_headers = { "origin": "https://frontendmasters.com", "referer": lesson_source_base, "cookie": cookies_str, - 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36" + 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/66.0.3359.117 Safari/537.36" } if self._downloader.params.get('listformats', False): @@ -240,18 +233,27 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' - _TEST = { + _TESTS = [{ 'url': 'https://frontendmasters.com/courses/content-strategy/', - # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': 'content-strategy', - 'title': 'Content Strategy' - } - } + 'title': 'Content Strategy', + 'description': 'md5:7916149d4539c5d6fa86ff43a5df213b' + }, + 'playlist_count': 31, + }, { + 'url': 'https://frontendmasters.com/courses/sql-fundamentals/', + 'only_matching': True, + }, { + 'url': 'https://frontendmasters.com/courses/introduction-to-javascript-jquery/', + 'only_matching': True, + }] def _real_extract(self, url): course_id = self._match_id(url) - course_json_content = self._download_course(course_id=course_id, url=url, display_id=None) + course_json_content = self._download_course(course_id=course_id, + url=url, + display_id=None) title = course_json_content.get('title') description = course_json_content.get('description') From 78909d9c6939078fbcf5a64dd6fb6fd59554308c Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 18:46:06 +0100 Subject: [PATCH 05/19] Removes unnecessary class variable --- youtube_dl/extractor/frontendmaster.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 58f12a26d..671c357e0 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -22,10 +22,6 @@ class FrontEndMasterBaseIE(InfoExtractor): _VIDEO_BASE = 'http://www.frontendmasters.com/courses' _COOKIES_BASE = 'https://api.frontendmasters.com' _LOGIN_URL = 'https://frontendmasters.com/login/' - _SUPPORTED_MEAN = { - "resolution": [360, 720, 1080], - "format": ['webm', 'mp4'] - } def _real_initialize(self): self._login() From f02a97581b810c54828c71315587de02355be269 Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 22:30:39 +0100 Subject: [PATCH 06/19] Removes spaces from end of the file --- youtube_dl/extractor/extractors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4e191d78d..6ed805cd9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1425,5 +1425,3 @@ from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE - - From c000009f5084a2ffab5942ab536642779e80906f Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 22:31:45 +0100 Subject: [PATCH 07/19] Use different method to check successful login, make the code compatible with python3 --- youtube_dl/extractor/frontendmaster.py | 42 +++++++++++++++++--------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 671c357e0..b228b3da9 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -54,10 +54,13 @@ class FrontEndMasterBaseIE(InfoExtractor): headers={'Content-Type': 'application/x-www-form-urlencoded'} ) - logout_link = self._search_regex('(Logout .*)', - response, 'logout-link') - if not logout_link: - raise ExtractorError('Unable to login', expected=True) + error = self._search_regex( + r']+class=["\']Message MessageAlert["\'][^>]*>([^<]+)', + response, 'error message', default=None) + + if error: + raise ExtractorError('Unable to login: check username and password', + expected=True) def _match_course_id(self, url): if '_VALID_URL_RE' not in self.__dict__: @@ -81,9 +84,13 @@ class FrontEndMasterBaseIE(InfoExtractor): current_section = None current_section_number = 0 for elem in lesson_elements: - if isinstance(elem, unicode): + if not isinstance(elem, int): + elem_name = elem + if not isinstance(elem_name, str): + # convert unicode to str + elem_name = elem.encode('utf-8') (current_section, current_section_number) = \ - (elem.encode('utf-8'), current_section_number + 1) + (elem_name, current_section_number + 1) else: if current_section: sections[elem] = (current_section, current_section_number) @@ -120,7 +127,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section_elements = course_json_content.get('lessonElements') lesson_data = course_json_content.get('lessonData')[lesson_hash] lesson_source_base = lesson_data.get('sourceBase') - course_sections_pairing = self._pair_section_with_video_elemen_index(lesson_section_elements) + course_sections_pairing = self._pair_section_with_video_elemen_index( + lesson_section_elements) lesson_title = lesson_data.get('title') lesson_description = lesson_data.get('description') @@ -130,7 +138,6 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section = course_sections_pairing.get(lesson_index)[0] lesson_section_number = course_sections_pairing.get(lesson_index)[1] - QUALITIES_PREFERENCE = ('low', 'medium', 'high') quality_key = qualities(QUALITIES_PREFERENCE) QUALITIES = { @@ -139,7 +146,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'high': {'width': 1920, 'height': 1080} } - AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) + AllowedQuality = collections.namedtuple('AllowedQuality', + ['ext', 'qualities']) ALLOWED_QUALITIES = [ AllowedQuality('webm', ['low', 'medium', 'high']), AllowedQuality('mp4', ['low', 'medium', 'high']) @@ -169,9 +177,11 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: - return (AllowedQuality(req_ext, (req_quality, )), ) - req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' - return (AllowedQuality(req_ext, ('high', )), ) + return (AllowedQuality(req_ext, (req_quality,)),) + req_ext = 'webm' if self._downloader.params.get( + 'prefer_free_formats') else 'mp4' + return (AllowedQuality(req_ext, ('high',)),) + allowed_qualities = guess_allowed_qualities() formats = [] @@ -182,8 +192,9 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'r': f['height'], 'f': ext } - video_response = self._download_json(video_request_url % lesson_source_base, video_id, - query=video_request_params, headers=video_request_headers) + video_response = self._download_json( + video_request_url % lesson_source_base, video_id, + query=video_request_params, headers=video_request_headers) # To avoid the possibility of problems with multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see the problem Pluralsight has on @@ -261,7 +272,8 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): entries = [] for video in videos_data: video_slug = video.get('slug') - clip_url = "%s/%s/%s" % (self._VIDEO_BASE, course_display_id, video_slug) + clip_url = "%s/%s/%s" % ( + self._VIDEO_BASE, course_display_id, video_slug) entries.append({ '_type': 'url_transparent', 'url': clip_url, From 65cf161b5d42da3558936b147fb580257d469aaf Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 22:54:41 +0100 Subject: [PATCH 08/19] Add a try catch for some optional fields --- youtube_dl/extractor/frontendmaster.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index b228b3da9..a48d5fe2d 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -122,21 +122,31 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): url=url, display_id=course_id) + # Necessary to get mandatory informations like title and video_url lesson_index = course_json_content.get('lessonSlugs').index(video_id) lesson_hash = course_json_content.get('lessonHashes')[lesson_index] - lesson_section_elements = course_json_content.get('lessonElements') lesson_data = course_json_content.get('lessonData')[lesson_hash] - lesson_source_base = lesson_data.get('sourceBase') - course_sections_pairing = self._pair_section_with_video_elemen_index( - lesson_section_elements) + # This is necessary to get the link for the video + lesson_source_base = lesson_data['sourceBase'] - lesson_title = lesson_data.get('title') + lesson_title = lesson_data['title'] + + # Some optional fields lesson_description = lesson_data.get('description') lesson_index = lesson_data.get('index') lesson_slug = lesson_data.get('slug') lesson_thumbnail_url = lesson_data.get('thumbnail') - lesson_section = course_sections_pairing.get(lesson_index)[0] - lesson_section_number = course_sections_pairing.get(lesson_index)[1] + lesson_section_elements = course_json_content.get('lessonElements') + + try: + course_sections_pairing = self._pair_section_with_video_elemen_index( + lesson_section_elements) + lesson_section = course_sections_pairing.get(lesson_index)[0] + lesson_section_number = course_sections_pairing.get(lesson_index)[1] + except Exception: + lesson_section = None + lesson_section_number = None + QUALITIES_PREFERENCE = ('low', 'medium', 'high') quality_key = qualities(QUALITIES_PREFERENCE) From 20d304156520f107c7850925ac30097b7859268a Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 22:56:14 +0100 Subject: [PATCH 09/19] Removes user-agent from video-request-header --- youtube_dl/extractor/frontendmaster.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index a48d5fe2d..11590e81c 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -170,10 +170,7 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): video_request_headers = { "origin": "https://frontendmasters.com", "referer": lesson_source_base, - "cookie": cookies_str, - 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/66.0.3359.117 Safari/537.36" + "cookie": cookies_str } if self._downloader.params.get('listformats', False): From f51e50d54684c32ce2ad4ca809717d6e138a9a0b Mon Sep 17 00:00:00 2001 From: Kerruba Date: Sun, 29 Apr 2018 23:16:16 +0100 Subject: [PATCH 10/19] Changes the test for the free-trial courses --- youtube_dl/extractor/frontendmaster.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 11590e81c..2ec4f6af5 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -104,12 +104,12 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' _TEST = { - 'url': 'https://frontendmasters.com/courses/content-strategy/introduction/', - 'md5': '5f176d4f170778524f40a06307a929f6', + 'url': 'https://frontendmasters.com/courses/javascript-basics/introduction/', + 'md5': 'a47be6ea0a384cbbb10fab10061f43d6', 'info_dict': { 'id': 'introduction', 'title': 'Introduction', - 'display_id': 'content-strategy', + 'display_id': 'javascript-basics', 'ext': 'mp4' }, 'skip': 'Requires FrontendMasters account credentials' @@ -247,21 +247,14 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' - _TESTS = [{ - 'url': 'https://frontendmasters.com/courses/content-strategy/', + _TEST = { + 'url': 'https://frontendmasters.com/courses/javascript-basics/', 'info_dict': { 'id': 'content-strategy', - 'title': 'Content Strategy', - 'description': 'md5:7916149d4539c5d6fa86ff43a5df213b' + 'title': 'Introduction to JavaScript Programming', }, - 'playlist_count': 31, - }, { - 'url': 'https://frontendmasters.com/courses/sql-fundamentals/', - 'only_matching': True, - }, { - 'url': 'https://frontendmasters.com/courses/introduction-to-javascript-jquery/', - 'only_matching': True, - }] + 'playlist_count': 19, + } def _real_extract(self, url): course_id = self._match_id(url) From 21aaaaa56049f90ef9e11a3f51ab50cc93614041 Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Mon, 30 Apr 2018 10:34:30 +0100 Subject: [PATCH 11/19] Reviews tests --- youtube_dl/extractor/frontendmaster.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 2ec4f6af5..b3bba6986 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -109,7 +109,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'info_dict': { 'id': 'introduction', 'title': 'Introduction', - 'display_id': 'javascript-basics', + 'display_id': 'introduction', + 'description': 'md5:319818bc390d42f937399befeedf7947', 'ext': 'mp4' }, 'skip': 'Requires FrontendMasters account credentials' @@ -250,10 +251,12 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): _TEST = { 'url': 'https://frontendmasters.com/courses/javascript-basics/', 'info_dict': { - 'id': 'content-strategy', + 'id': 'javascript-basics', 'title': 'Introduction to JavaScript Programming', + 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' }, 'playlist_count': 19, + 'skip': 'Requires FrontendMasters account credentials' } def _real_extract(self, url): From 5ce9a04e2b82921a67dbb558a50b1e26210158a1 Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Mon, 30 Apr 2018 10:40:03 +0100 Subject: [PATCH 12/19] Changes test for single video --- youtube_dl/extractor/frontendmaster.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index b3bba6986..3db948e65 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -104,16 +104,16 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' _TEST = { - 'url': 'https://frontendmasters.com/courses/javascript-basics/introduction/', - 'md5': 'a47be6ea0a384cbbb10fab10061f43d6', + 'url': 'https://frontendmasters.com/courses/web-development/tools', + 'md5': '7f161159710d6b7016a4f4af6fcb05e2', 'info_dict': { - 'id': 'introduction', - 'title': 'Introduction', - 'display_id': 'introduction', - 'description': 'md5:319818bc390d42f937399befeedf7947', + 'id': 'tools', + 'title': 'Tools', + 'display_id': 'tools', + 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7', 'ext': 'mp4' }, - 'skip': 'Requires FrontendMasters account credentials' + 'skip': 'Requires FrontendMasters account credentials', } def _real_extract(self, url): From f13d3ccde152daed66303636390ac2b7d3fe80aa Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Mon, 30 Apr 2018 12:09:08 +0100 Subject: [PATCH 13/19] Condense the IE in one single object --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/frontendmaster.py | 52 +++++++++++++------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6ed805cd9..219b5e325 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -385,8 +385,7 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE from .frontendmaster import ( - FrontEndMasterIE, - FrontEndMasterCourseIE + FrontEndMasterIE ) from .funimation import FunimationIE from .funk import ( diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 3db948e65..711b63a4e 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -100,10 +100,20 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?(?P[a-z\-]+)?/?' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' - _TEST = { + + _TESTS = [{ + 'url': 'https://frontendmasters.com/courses/javascript-basics/', + 'info_dict': { + 'id': 'javascript-basics', + 'title': 'Introduction to JavaScript Programming', + 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' + }, + 'playlist_count': 19, + 'skip': 'Requires FrontendMasters account credentials' + }, { 'url': 'https://frontendmasters.com/courses/web-development/tools', 'md5': '7f161159710d6b7016a4f4af6fcb05e2', 'info_dict': { @@ -114,11 +124,9 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'ext': 'mp4' }, 'skip': 'Requires FrontendMasters account credentials', - } + }] - def _real_extract(self, url): - video_id = self._match_id(url) - course_id = self._match_course_id(url) + def _download_single_video(self, url, course_id, video_id): course_json_content = self._download_course(course_id=course_id, url=url, display_id=course_id) @@ -241,26 +249,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'formats': formats } + def _download_entire_course(self, url, course_id): -class FrontEndMasterCourseIE(FrontEndMasterBaseIE): - IE_NAME = 'frontend-masters:course' - IE_DESC = "frontendmasters.com online courses" - - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?' - _NETRC_MACHINE = 'frontend-masters' - _TEST = { - 'url': 'https://frontendmasters.com/courses/javascript-basics/', - 'info_dict': { - 'id': 'javascript-basics', - 'title': 'Introduction to JavaScript Programming', - 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' - }, - 'playlist_count': 19, - 'skip': 'Requires FrontendMasters account credentials' - } - - def _real_extract(self, url): - course_id = self._match_id(url) course_json_content = self._download_course(course_id=course_id, url=url, display_id=None) @@ -284,3 +274,15 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): }) return self.playlist_result(entries, course_id, title, description) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + course_id = mobj.group('courseid') + if video_id: + return self._download_single_video(url, course_id, video_id) + else: + return self._download_entire_course(url, course_id) + + + From f0e3ba85e197506591ddc859c260d83d3bca8abc Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Mon, 30 Apr 2018 17:04:35 +0100 Subject: [PATCH 14/19] Split the code into two IE and remove debug code --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/frontendmaster.py | 69 +++++++++++--------------- 2 files changed, 31 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 219b5e325..6ed805cd9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -385,7 +385,8 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE from .frontendmaster import ( - FrontEndMasterIE + FrontEndMasterIE, + FrontEndMasterCourseIE ) from .funimation import FunimationIE from .funk import ( diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 711b63a4e..479852f69 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -100,20 +100,11 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?(?P[a-z\-]+)?/?' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' _NETRC_MACHINE = 'frontend-masters' - _TESTS = [{ - 'url': 'https://frontendmasters.com/courses/javascript-basics/', - 'info_dict': { - 'id': 'javascript-basics', - 'title': 'Introduction to JavaScript Programming', - 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' - }, - 'playlist_count': 19, - 'skip': 'Requires FrontendMasters account credentials' - }, { + _TEST = { 'url': 'https://frontendmasters.com/courses/web-development/tools', 'md5': '7f161159710d6b7016a4f4af6fcb05e2', 'info_dict': { @@ -124,9 +115,13 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'ext': 'mp4' }, 'skip': 'Requires FrontendMasters account credentials', - }] + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + course_id = mobj.group('courseid') - def _download_single_video(self, url, course_id, video_id): course_json_content = self._download_course(course_id=course_id, url=url, display_id=course_id) @@ -212,19 +207,6 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): video_request_url % lesson_source_base, video_id, query=video_request_params, headers=video_request_headers) - # To avoid the possibility of problems with multiple sequential calls to ViewClip API and start - # to return 429 HTTP errors after some time (see the problem Pluralsight has on - # https://github.com/rg3/youtube-dl/pull/6989) and avoid also the risk of - # account ban (see https://github.com/rg3/youtube-dl/issues/6842), - # we will sleep random amount of time before each call to ViewClip. - - # self._sleep( - # random.randint(2, 5), lesson_slug, - # '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') - # - # if not video_response: - # continue - video_url = video_response.get('url') clip_f = f.copy() clip_f.update({ @@ -249,8 +231,27 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'formats': formats } - def _download_entire_course(self, url, course_id): +class FrontEndMasterCourseIE(FrontEndMasterBaseIE): + IE_NAME = 'frontend-masters:course' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?$' + + _NETRC_MACHINE = 'frontend-masters' + + _TEST = { + 'url': 'https://frontendmasters.com/courses/javascript-basics/', + 'info_dict': { + 'id': 'javascript-basics', + 'title': 'Introduction to JavaScript Programming', + 'description': 'md5:269412fbb76d86954761599ad8e4cbc9' + }, + 'playlist_count': 19, + 'skip': 'Requires FrontendMasters account credentials' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('courseid') course_json_content = self._download_course(course_id=course_id, url=url, display_id=None) @@ -273,16 +274,4 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'ie_key': FrontEndMasterIE.ie_key() }) - return self.playlist_result(entries, course_id, title, description) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - course_id = mobj.group('courseid') - if video_id: - return self._download_single_video(url, course_id, video_id) - else: - return self._download_entire_course(url, course_id) - - - + return self.playlist_result(entries, course_id, title, description) \ No newline at end of file From 7f69c7d44b7c4ec43ecef9bc7492fa9466b9f29c Mon Sep 17 00:00:00 2001 From: Kerruba Date: Mon, 30 Apr 2018 22:05:00 +0100 Subject: [PATCH 15/19] Cleans code, add captions download --- youtube_dl/extractor/frontendmaster.py | 49 +++++++++++++++++++------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 479852f69..a3a342f9b 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -8,18 +8,19 @@ import re from .common import InfoExtractor from ..compat import ( compat_urlparse, + compat_basestring, compat_str) from ..utils import ( ExtractorError, urlencode_postdata, - qualities -) + qualities, unescapeHTML) class FrontEndMasterBaseIE(InfoExtractor): _API_BASE = 'https://api.frontendmasters.com/v1/kabuki/courses' _VIDEO_BASE = 'http://www.frontendmasters.com/courses' + _CAPTIONS_BASE = 'https://api.frontendmasters.com/v1/kabuki/transcripts' _COOKIES_BASE = 'https://api.frontendmasters.com' _LOGIN_URL = 'https://frontendmasters.com/login/' @@ -59,17 +60,15 @@ class FrontEndMasterBaseIE(InfoExtractor): response, 'error message', default=None) if error: - raise ExtractorError('Unable to login: check username and password', + raise ExtractorError('Unable to login: %s' % unescapeHTML(error), expected=True) def _match_course_id(self, url): - if '_VALID_URL_RE' not in self.__dict__: - self._VALID_URL_RE = re.compile(self._VALID_URL) - m = self._VALID_URL_RE.match(url) + m = re.match(self._VALID_URL, url) assert m return compat_str(m.group('courseid')) - def _download_course(self, course_id, url, display_id): + def _download_course(self, course_id, url): response = self._download_json( '%s/%s' % (self._API_BASE, course_id), course_id, 'Downloading course JSON', @@ -79,7 +78,8 @@ class FrontEndMasterBaseIE(InfoExtractor): }) return response - def _pair_section_with_video_elemen_index(self, lesson_elements): + @staticmethod + def _pair_section_with_video_elemen_index(lesson_elements): sections = {} current_section = None current_section_number = 0 @@ -100,7 +100,7 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)/?' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)' _NETRC_MACHINE = 'frontend-masters' @@ -117,6 +117,29 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'skip': 'Requires FrontendMasters account credentials', } + @staticmethod + def _convert_subtitles(captions): + if captions and isinstance(captions, compat_basestring): + if captions.startswith('WEBVTT'): + # Assumes captions are in WEBVTT format + captions = captions.replace('WEBVTT', '') + captions = captions.replace('.', ',') + return captions + + def _get_subtitles(self, video_hash, video_id): + captions = self._download_webpage( + '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, + fatal=False) + srt_captions = FrontEndMasterIE._convert_subtitles(captions) + + if srt_captions: + return { + 'en': [{ + 'ext': 'srt', + 'data': srt_captions + }] + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -151,7 +174,6 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section = None lesson_section_number = None - QUALITIES_PREFERENCE = ('low', 'medium', 'high') quality_key = qualities(QUALITIES_PREFERENCE) QUALITIES = { @@ -220,6 +242,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): self._sort_formats(formats) + subtitles = self.extract_subtitles(lesson_hash, video_id) + return { 'id': video_id, 'display_id': lesson_slug, @@ -228,7 +252,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'chapter': lesson_section, 'chapter_number': lesson_section_number, 'thumbnail': lesson_thumbnail_url, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles } @@ -274,4 +299,4 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): 'ie_key': FrontEndMasterIE.ie_key() }) - return self.playlist_result(entries, course_id, title, description) \ No newline at end of file + return self.playlist_result(entries, course_id, title, description) From 52978603e2856997c3de365f9928912d67d41cda Mon Sep 17 00:00:00 2001 From: Kerruba Date: Mon, 30 Apr 2018 22:06:17 +0100 Subject: [PATCH 16/19] Removes doublequotes from login details --- youtube_dl/extractor/frontendmaster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index a3a342f9b..c03485ec1 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -38,8 +38,8 @@ class FrontEndMasterBaseIE(InfoExtractor): login_form = self._hidden_inputs(login_page) login_form.update({ - "username": username, - "password": password + 'username': username, + 'password': password }) post_url = self._search_regex( From 8d8f1882440cd4310499a73f3869cd375096405d Mon Sep 17 00:00:00 2001 From: Kerruba Date: Thu, 3 May 2018 23:38:36 +0100 Subject: [PATCH 17/19] Update the _download_course function everywhere --- youtube_dl/extractor/frontendmaster.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index c03485ec1..cfd069c12 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -146,8 +146,7 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): course_id = mobj.group('courseid') course_json_content = self._download_course(course_id=course_id, - url=url, - display_id=course_id) + url=url) # Necessary to get mandatory informations like title and video_url lesson_index = course_json_content.get('lessonSlugs').index(video_id) @@ -278,8 +277,7 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): mobj = re.match(self._VALID_URL, url) course_id = mobj.group('courseid') course_json_content = self._download_course(course_id=course_id, - url=url, - display_id=None) + url=url) title = course_json_content.get('title') description = course_json_content.get('description') From 3d20dd3e2e889900961cd4ceb9bfc73ea2bf98d4 Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Tue, 8 May 2018 12:38:33 +0100 Subject: [PATCH 18/19] Update code to comply to flake8 and other changes requests --- youtube_dl/extractor/frontendmaster.py | 42 ++++++++++++++++---------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index cfd069c12..25a0d9fc5 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -56,7 +56,9 @@ class FrontEndMasterBaseIE(InfoExtractor): ) error = self._search_regex( - r']+class=["\']Message MessageAlert["\'][^>]*>([^<]+)', + r']+class=["\']Message MessageAlert["\'][^>]*>' + r'([^<]+)' + r'', response, 'error message', default=None) if error: @@ -79,7 +81,7 @@ class FrontEndMasterBaseIE(InfoExtractor): return response @staticmethod - def _pair_section_with_video_elemen_index(lesson_elements): + def _pair_section_video_element(lesson_elements): sections = {} current_section = None current_section_number = 0 @@ -100,9 +102,11 @@ class FrontEndMasterBaseIE(InfoExtractor): class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/(?P[a-z\-]+)' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ + r'(?P[a-z\-]+)/' \ + r'(?P[a-z\-]+)$' - _NETRC_MACHINE = 'frontend-masters' + _NETRC_MACHINE = 'frontendmasters' _TEST = { 'url': 'https://frontendmasters.com/courses/web-development/tools', @@ -165,10 +169,14 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section_elements = course_json_content.get('lessonElements') try: - course_sections_pairing = self._pair_section_with_video_elemen_index( + course_sections_pairing = self._pair_section_video_element( lesson_section_elements) - lesson_section = course_sections_pairing.get(lesson_index)[0] - lesson_section_number = course_sections_pairing.get(lesson_index)[1] + + lesson_section = \ + course_sections_pairing.get(lesson_index)[0] + + lesson_section_number = \ + course_sections_pairing.get(lesson_index)[1] except Exception: lesson_section = None lesson_section_number = None @@ -189,13 +197,13 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): ] cookies = self._get_cookies(self._COOKIES_BASE) - cookies_str = ";".join(["%s=%s" % (cookie.key, cookie.value) + cookies_str = ';'.join(['%s=%s' % (cookie.key, cookie.value) for cookie in cookies.values()]) - video_request_url = "%s/source" + video_request_url = '%s/source' video_request_headers = { - "origin": "https://frontendmasters.com", - "referer": lesson_source_base, - "cookie": cookies_str + 'origin': 'https://frontendmasters.com', + 'referer': lesson_source_base, + 'cookie': cookies_str } if self._downloader.params.get('listformats', False): @@ -208,7 +216,8 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): req_ext, req_quality = req_format_split req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: - if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: + if req_ext == allowed_quality.ext and \ + req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality,)),) req_ext = 'webm' if self._downloader.params.get( 'prefer_free_formats') else 'mp4' @@ -258,9 +267,10 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): class FrontEndMasterCourseIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters:course' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ + r'(?P[a-z\-]+)/?$' - _NETRC_MACHINE = 'frontend-masters' + _NETRC_MACHINE = 'frontendmasters' _TEST = { 'url': 'https://frontendmasters.com/courses/javascript-basics/', @@ -289,7 +299,7 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): entries = [] for video in videos_data: video_slug = video.get('slug') - clip_url = "%s/%s/%s" % ( + clip_url = '%s/%s/%s' % ( self._VIDEO_BASE, course_display_id, video_slug) entries.append({ '_type': 'url_transparent', From a648b86b053f1937838fd338a41f22a8d55ea5cc Mon Sep 17 00:00:00 2001 From: Luca Cherubin Date: Sun, 27 May 2018 16:31:31 +0200 Subject: [PATCH 19/19] Integrates required changes --- youtube_dl/extractor/frontendmaster.py | 93 ++++++++------------------ 1 file changed, 27 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/frontendmaster.py b/youtube_dl/extractor/frontendmaster.py index 25a0d9fc5..21e382da9 100644 --- a/youtube_dl/extractor/frontendmaster.py +++ b/youtube_dl/extractor/frontendmaster.py @@ -2,15 +2,11 @@ from __future__ import unicode_literals import collections - import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, - compat_basestring, - compat_str) - + compat_urlparse) from ..utils import ( ExtractorError, urlencode_postdata, @@ -24,6 +20,20 @@ class FrontEndMasterBaseIE(InfoExtractor): _COOKIES_BASE = 'https://api.frontendmasters.com' _LOGIN_URL = 'https://frontendmasters.com/login/' + _QUALITIES_PREFERENCE = ('low', 'medium', 'high') + _QUALITIES = { + 'low': {'width': 480, 'height': 360}, + 'medium': {'width': 1280, 'height': 720}, + 'high': {'width': 1920, 'height': 1080} + } + + AllowedQuality = collections.namedtuple('AllowedQuality', + ['ext', 'qualities']) + _ALLOWED_QUALITIES = [ + AllowedQuality('webm', ['low', 'medium', 'high']), + AllowedQuality('mp4', ['low', 'medium', 'high']) + ] + def _real_initialize(self): self._login() @@ -65,11 +75,6 @@ class FrontEndMasterBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % unescapeHTML(error), expected=True) - def _match_course_id(self, url): - m = re.match(self._VALID_URL, url) - assert m - return compat_str(m.group('courseid')) - def _download_course(self, course_id, url): response = self._download_json( '%s/%s' % (self._API_BASE, course_id), course_id, @@ -104,7 +109,7 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters' _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ r'(?P[a-z\-]+)/' \ - r'(?P[a-z\-]+)$' + r'(?P[a-z\-]+)' _NETRC_MACHINE = 'frontendmasters' @@ -121,26 +126,15 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): 'skip': 'Requires FrontendMasters account credentials', } - @staticmethod - def _convert_subtitles(captions): - if captions and isinstance(captions, compat_basestring): - if captions.startswith('WEBVTT'): - # Assumes captions are in WEBVTT format - captions = captions.replace('WEBVTT', '') - captions = captions.replace('.', ',') - return captions - def _get_subtitles(self, video_hash, video_id): captions = self._download_webpage( '%s/%s.vtt' % (self._CAPTIONS_BASE, video_hash), video_id, fatal=False) - srt_captions = FrontEndMasterIE._convert_subtitles(captions) - - if srt_captions: + if captions: return { 'en': [{ - 'ext': 'srt', - 'data': srt_captions + 'ext': 'vtt', + 'data': captions }] } @@ -181,54 +175,18 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): lesson_section = None lesson_section_number = None - QUALITIES_PREFERENCE = ('low', 'medium', 'high') - quality_key = qualities(QUALITIES_PREFERENCE) - QUALITIES = { - 'low': {'width': 480, 'height': 360}, - 'medium': {'width': 1280, 'height': 720}, - 'high': {'width': 1920, 'height': 1080} - } - - AllowedQuality = collections.namedtuple('AllowedQuality', - ['ext', 'qualities']) - ALLOWED_QUALITIES = [ - AllowedQuality('webm', ['low', 'medium', 'high']), - AllowedQuality('mp4', ['low', 'medium', 'high']) - ] - - cookies = self._get_cookies(self._COOKIES_BASE) - cookies_str = ';'.join(['%s=%s' % (cookie.key, cookie.value) - for cookie in cookies.values()]) video_request_url = '%s/source' video_request_headers = { 'origin': 'https://frontendmasters.com', 'referer': lesson_source_base, - 'cookie': cookies_str } - if self._downloader.params.get('listformats', False): - allowed_qualities = ALLOWED_QUALITIES - else: - def guess_allowed_qualities(): - req_format = self._downloader.params.get('format') or 'best' - req_format_split = req_format.split('-', 1) - if len(req_format_split) > 1: - req_ext, req_quality = req_format_split - req_quality = '-'.join(req_quality.split('-')[:2]) - for allowed_quality in ALLOWED_QUALITIES: - if req_ext == allowed_quality.ext and \ - req_quality in allowed_quality.qualities: - return (AllowedQuality(req_ext, (req_quality,)),) - req_ext = 'webm' if self._downloader.params.get( - 'prefer_free_formats') else 'mp4' - return (AllowedQuality(req_ext, ('high',)),) - - allowed_qualities = guess_allowed_qualities() + quality_key = qualities(self._QUALITIES_PREFERENCE) formats = [] - for ext, qualities_ in allowed_qualities: + for ext, qualities_ in self._ALLOWED_QUALITIES: for quality in qualities_: - f = QUALITIES[quality].copy() + f = self._QUALITIES[quality].copy() video_request_params = { 'r': f['height'], 'f': ext @@ -267,8 +225,7 @@ class FrontEndMasterIE(FrontEndMasterBaseIE): class FrontEndMasterCourseIE(FrontEndMasterBaseIE): IE_NAME = 'frontend-masters:course' - _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/' \ - r'(?P[a-z\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P[a-z\-]+)/?$' _NETRC_MACHINE = 'frontendmasters' @@ -283,6 +240,10 @@ class FrontEndMasterCourseIE(FrontEndMasterBaseIE): 'skip': 'Requires FrontendMasters account credentials' } + @classmethod + def suitable(cls, url): + return False if FrontEndMasterIE.suitable(url) else super(FrontEndMasterBaseIE, cls).suitable(url) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) course_id = mobj.group('courseid')