# coding: utf-8
#
# Target path: youtube_dl/extractor/libraryofcongress.py
#
# Reconstructed from a three-commit patch series by TRox1972 (May 2016):
#   1/3  [LibraryOfCongress] Added new extractor (Closes #3188)
#   2/3  [LibraryOfCongress] don't use video_id for _search_regex()
#   3/3  [LibraryOfCongress] Improvements
#
# The series also registers the extractor in
# youtube_dl/extractor/extractors.py, between the `.leeco` and `.libsyn`
# imports:
#   from .libraryofcongress import LibraryOfCongressIE
from __future__ import unicode_literals

from .common import InfoExtractor

from ..utils import determine_ext


class LibraryOfCongressIE(InfoExtractor):
    """Extractor for media items hosted on loc.gov (``/item/<numeric id>``)."""

    # The numeric item id is captured in the named group `id`, which is the
    # group `_match_id()` is expected to read.
    _VALID_URL = r'https?://(?:www\.)?loc\.gov/item/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'http://loc.gov/item/90716351/',
        'info_dict': {
            'id': '90716351',
            'ext': 'mp4',
            'title': 'Pa\'s trip to Mars /'
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://www.loc.gov/item/97516576/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a loc.gov item page.

        Steps, as performed below:
          1. Download the item page and pull the 32-character media id out
             of its ``media-player-<ID>`` marker.
          2. Fetch the media description JSON from media.loc.gov and take
             the first derivative's URL.
          3. Rewrite that URL to an HTTPS HLS manifest (video) or a direct
             HTTPS file (audio).

        Returns a dict with ``id``, ``thumbnail``, ``title`` and ``formats``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)
        json_id = self._search_regex(
            r'media-player-([0-9A-Z]{32})', webpage, 'json id')

        data = self._download_json(
            'https://media.loc.gov/services/v1/media?id=%s' % json_id,
            video_id)['mediaObject']

        media_url = data['derivatives'][0]['derivativeUrl']
        # The service hands out rtmp URLs; the same paths are fetched over
        # https below.
        media_url = media_url.replace('rtmp', 'https')

        is_video = data['mediaType'].lower() == 'v'
        if determine_ext(media_url) not in ('mp4', 'mp3'):
            media_url += '.mp4' if is_video else '.mp3'

        # Use `in` for the substring tests: str.index() raises ValueError on
        # a miss instead of returning -1, which made the audio branch
        # unreachable and crashed on any URL without 'vod/mp4:'.
        if 'vod/mp4:' in media_url:
            media_url = media_url.replace(
                'vod/mp4:', 'hls-vod/media/') + '.m3u8'
        elif 'vod/mp3:' in media_url:
            media_url = media_url.replace('vod/mp3:', '')

        ext = determine_ext(media_url)
        if ext == 'm3u8':
            formats = self._extract_m3u8_formats(
                media_url, video_id, ext='mp4')
        else:
            # Direct file (mp3, or an mp4 that is not served through the
            # 'vod/mp4:' application): expose it as a single plain format
            # rather than returning an empty format list.
            formats = [{
                'url': media_url,
                'ext': ext,
            }]

        return {
            'id': video_id,
            'thumbnail': self._og_search_thumbnail(webpage),
            'title': self._og_search_title(webpage),
            'formats': formats,
        }