From 5abf513cf87049b63369e45dd59818136080a68e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jul 2016 10:44:16 +0100 Subject: [PATCH 01/19] [stitcher] fix episode config extraction --- youtube_dl/extractor/stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index d5c852f52..0f8782d03 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor): episode = self._parse_json( js_to_json(self._search_regex( - r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), display_id)['config']['episode'] title = unescapeHTML(episode['title']) From 252a1f75d205b2c95c6a04c7aa64f08c756e4954 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jul 2016 11:46:25 +0100 Subject: [PATCH 02/19] [spiegel] improve info extraction --- youtube_dl/extractor/spiegel.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 39a7aaf9d..c67d556e7 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,8 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + extract_attributes, + unified_strdate, + get_element_by_attribute, +) class SpiegelIE(InfoExtractor): @@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, + 'upload_date': '20130311', }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', @@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, + 'upload_date': '20131115', }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', @@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor): 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - title = re.sub(r'\s+', ' ', self._html_search_regex( - r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)', - webpage, 'title')) - description = self._html_search_meta('description', webpage, 'description') + video_data = extract_attributes(self._search_regex(r'(]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) + + title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) + description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( [r'server\s*:\s*(["\'])(?P.+?)\1', r'var\s+server\s*=\s*"(?P[^"]+)\"'], @@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, + 'description': description.strip() if description else None, 'duration': duration, + 'upload_date': unified_strdate(video_data.get('data-video-date')), 'formats': formats, } From 77082c7b9ef2ea95161e4e288c110b5f7f34fda0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jul 2016 12:01:04 +0100 Subject: [PATCH 03/19] [slideshare] fix description extraction --- youtube_dl/extractor/slideshare.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 0b717a1e4..4967c1b77 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -9,6 +9,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + get_element_by_id, ) @@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor): bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) - description = self._html_search_regex( + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( r'(?s)]+itemprop="description"[^>]*>(.+?)

', webpage, 'description', fatal=False) @@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description, + 'description': description.strip() if description else None, } From 94a5cff91d81c120b8e8b395f9fbe465286940f0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 5 Jul 2016 13:37:46 +0100 Subject: [PATCH 04/19] [sendia] fix info extraction --- youtube_dl/extractor/sandia.py | 94 +++++++++------------------------- 1 file changed, 25 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a49..9ab4d20a7 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -27,7 +27,8 @@ class SandiaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', + 'upload_date': '20120409', + 'timestamp': 1333983600, 'duration': 7794, } } @@ -35,81 +36,36 @@ class SandiaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) + presentation_data = self._download_json( + 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', + video_id, data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': video_id, + 'QueryString': '', + } + }), headers={ + 'Content-Type': 'application/json; charset=utf-8', + })['d']['Presentation'] - js_path = self._search_regex( - r'