diff --git a/ChangeLog b/ChangeLog
index 7d71fc5e1..a83523cb9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+version <unreleased>
+
+Extractors
+* [youku:show] Fix playlist extraction (#13248)
++ [dispeak] Recognize sevt subdomain (#13276)
+
+
 version 2017.07.15
 
 Core
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 70989e232..e0decb81c 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
         'id': 'testid',
         'title': 'testttitle',
         'extractor': 'testex',
+        'extractor_key': 'TestEx',
     }
     res.update(**kwargs)
     return res
@@ -761,7 +762,8 @@ class TestYoutubeDL(unittest.TestCase):
                     '_type': 'url_transparent',
                     'url': 'foo2:',
                     'ie_key': 'Foo2',
-                    'title': 'foo1 title'
+                    'title': 'foo1 title',
+                    'id': 'foo1_id',
                 }
 
         class Foo2IE(InfoExtractor):
@@ -787,6 +789,9 @@ class TestYoutubeDL(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['url'], TEST_URL)
         self.assertEqual(downloaded['title'], 'foo1 title')
+        self.assertEqual(downloaded['id'], 'testid')
+        self.assertEqual(downloaded['extractor'], 'testex')
+        self.assertEqual(downloaded['extractor_key'], 'TestEx')
 
 
 if __name__ == '__main__':
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 89c07be29..f94836d06 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -860,7 +860,7 @@ class YoutubeDL(object):
             force_properties = dict(
                 (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'ie_key'):
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                 if f in force_properties:
                     del force_properties[f]
             new_result = info.copy()
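Not part of the patch, but a minimal standalone sketch of the merge behaviour the widened exclusion list (and the new test assertions above) is meant to guarantee: a url_transparent result may still override presentation fields such as the title, yet it no longer clobbers the id, extractor and extractor_key of the entry it resolves to. All names below are illustrative only.

    # Illustrative only: mirrors the force_properties filtering above.
    EXCLUDED_FIELDS = ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key')

    def merge_url_transparent(resolved, transparent):
        """Overlay non-None fields of a url_transparent dict onto the resolved
        info dict, keeping the resolved entry's identity fields intact."""
        forced = {k: v for k, v in transparent.items()
                  if v is not None and k not in EXCLUDED_FIELDS}
        merged = dict(resolved)
        merged.update(forced)
        return merged

    resolved = {'id': 'testid', 'extractor': 'testex', 'extractor_key': 'TestEx',
                'title': 'resolved title'}
    transparent = {'_type': 'url_transparent', 'id': 'foo1_id', 'title': 'foo1 title'}

    merged = merge_url_transparent(resolved, transparent)
    assert merged['id'] == 'testid'         # identity fields are preserved
    assert merged['title'] == 'foo1 title'  # other metadata is still overridden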
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
index a78cb8a2a..c05f601e2 100644
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class DigitallySpeakingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+    _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
 
     _TESTS = [{
         # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
         # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
         'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
         'only_matching': True,
+    }, {
+        # From http://www.gdcvault.com/play/1013700/Advanced-Material
+        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+        'only_matching': True,
     }]
 
     def _parse_mp4(self, metadata):
diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py
index c86f52319..e4a3046af 100644
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@@ -2,6 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
 
 
 class EggheadCourseIE(InfoExtractor):
@@ -33,3 +38,47 @@ class EggheadCourseIE(InfoExtractor):
 
         return self.playlist_result(
             entries, playlist_id, course.get('title'), course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+    IE_DESC = 'egghead.io lesson'
+    IE_NAME = 'egghead:lesson'
+    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'info_dict': {
+            'id': 'fv5yotjxcg',
+            'ext': 'mp4',
+            'title': 'Create linear data flow with container style types (Box)',
+            'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'timestamp': 1481296768,
+            'upload_date': '20161209',
+            'duration': 304,
+            'view_count': 0,
+            'tags': ['javascript', 'free'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        lesson_id = self._match_id(url)
+
+        lesson = self._download_json(
+            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'Wistia',
+            'url': 'wistia:%s' % lesson['wistia_id'],
+            'id': lesson['wistia_id'],
+            'title': lesson.get('title'),
+            'description': lesson.get('summary'),
+            'thumbnail': lesson.get('thumb_nail'),
+            'timestamp': unified_timestamp(lesson.get('published_at')),
+            'duration': int_or_none(lesson.get('duration')),
+            'view_count': int_or_none(lesson.get('plays_count')),
+            'tags': try_get(lesson, lambda x: x['tag_list'], list),
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e8a066b83..db7616caa 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -298,7 +298,10 @@ from .dw import (
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+    EggheadCourseIE,
+    EggheadLessonIE,
+)
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
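Not part of the patch: a short usage sketch for the new egghead:lesson extractor through youtube-dl's embedding API (the URL comes from the test case above; running this needs network access).

    from __future__ import unicode_literals

    import youtube_dl

    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        info = ydl.extract_info(
            'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
            download=False)
        # The lesson resolves as url_transparent onto Wistia, so the final dict
        # carries the Wistia id together with egghead's own metadata.
        print('%s: %s' % (info.get('id'), info.get('title')))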
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 49409369c..f85e7de14 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,10 +1,14 @@
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+)
 
 
 class FunnyOrDieIE(InfoExtractor):
@@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
             'title': 'Heart-Shaped Box: Literal Video Version',
             'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
             'thumbnail': r're:^http:.*\.jpg$',
+            'uploader': 'DASjr',
+            'timestamp': 1317904928,
+            'upload_date': '20111006',
+            'duration': 318.3,
         },
     }, {
         'url': 'http://www.funnyordie.com/embed/e402820827',
@@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
             'title': 'Please Use This Song (Jon Lajoie)',
             'description': 'Please use this to sell something. www.jonlajoie.com',
             'thumbnail': r're:^http:.*\.jpg$',
+            'timestamp': 1398988800,
+            'upload_date': '20140502',
         },
         'params': {
             'skip_download': True,
@@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
                 'url': 'http://www.funnyordie.com%s' % src,
             }]
 
-        post_json = self._search_regex(
-            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
-        post = json.loads(post_json)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'uploadDate', webpage, 'timestamp', default=None))
+
+        uploader = self._html_search_regex(
+            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
+            webpage, 'uploader', default=None)

 embed
         {
             'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',

diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 8598377b0..84298fee4 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -122,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
         },
 
         'playlist_count': 6,
+    }, {
+        # Nexx iFrame embed
+        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py
index bf93eb868..e9474533f 100644
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@@ -8,6 +8,9 @@ from ..utils import extract_attributes
 
 
 class TBSIE(TurnerBaseIE):
+    # https://github.com/rg3/youtube-dl/issues/13658
+    _WORKING = False
+
     _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
     _TESTS = [{
         'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
             'ext': 'mp4',
             'title': 'Theatrical Trailer',
             'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
     }, {
         'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
         'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
             'ext': 'mp4',
             'title': 'You Better Run',
             'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
     }]
 
     def _real_extract(self, url):
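Not part of the patch: a small sketch of how the new _WORKING = False flag surfaces through InfoExtractor's existing classmethods. URL matching is untouched, so TBS/TNT pages are still recognized, and youtube-dl will merely warn that the extractor is known to be broken.

    from youtube_dl.extractor.tbs import TBSIE

    url = ('http://www.tbs.com/videos/people-of-earth/season-1/'
           'extras/2007318/theatrical-trailer.html')

    print(TBSIE.suitable(url))  # True  -- the _VALID_URL pattern still matches
    print(TBSIE.working())      # False -- extractor is marked broken (issue 13658)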
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index dcce15d77..0c4bc2eda 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import itertools
 import random
 import re
 import string
@@ -14,7 +13,6 @@ from ..utils import (
     js_to_json,
     str_or_none,
     strip_jsonp,
-    urljoin,
 )
 
 
@@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
     _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
     IE_NAME = 'youku:show'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
         'info_dict': {
             'id': 'zc7c670be07ff11e48b3f',
-            'title': '花千骨 未删减版',
+            'title': '花千骨 DVD版',
             'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
         },
         'playlist_count': 50,
-    }
+    }, {
+        # Episode number not starting from 1
+        'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+        'info_dict': {
+            'id': 'zefbfbd70efbfbd780bef',
+            'title': '超级飞侠3',
+            'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+        },
+        'playlist_count': 24,
+    }, {
+        # Ongoing playlist. The initial page is the last one
+        'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+        'only_matching': True,
+    }]
 
-    _PAGE_SIZE = 40
+    def _extract_entries(self, playlist_data_url, show_id, note, query):
+        query['callback'] = 'cb'
+        playlist_data = self._download_json(
+            playlist_data_url, show_id, query=query, note=note,
+            transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+        drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+                      get_element_by_class('p-drama-half-row', playlist_data))
+        if drama_list is None:
+            raise ExtractorError('No episodes found')
+        video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+        return playlist_data, [
+            self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+            for video_url in video_urls]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
@@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
         page_config = self._parse_json(self._search_regex(
             r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
             show_id, transform_source=js_to_json)
-        for idx in itertools.count(0):
-            if idx == 0:
-                playlist_data_url = 'http://list.youku.com/show/module'
-                query = {'id': page_config['showid'], 'tab': 'point'}
-            else:
-                playlist_data_url = 'http://list.youku.com/show/point'
-                query = {
-                    'id': page_config['showid'],
-                    'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1),
-                }
-            query['callback'] = 'cb'
-            playlist_data = self._download_json(
-                playlist_data_url, show_id, query=query,
+        first_page, initial_entries = self._extract_entries(
+            'http://list.youku.com/show/module', show_id,
+            note='Downloading initial playlist data page',
+            query={
+                'id': page_config['showid'],
+                'tab': 'showInfo',
+            })
+        first_page_reload_id = self._html_search_regex(
+            r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+        # The first reload_id has the same items as first_page
+        reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+        for idx, reload_id in enumerate(reload_ids):
+            if reload_id == first_page_reload_id:
+                entries.extend(initial_entries)
+                continue
+            _, new_entries = self._extract_entries(
+                'http://list.youku.com/show/episode', show_id,
                 note='Downloading playlist data page %d' % (idx + 1),
-                transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
-            video_urls = re.findall(
-                r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"',
-                playlist_data)
-            new_entries = [
-                self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
-                for video_url in video_urls]
+                query={
+                    'id': page_config['showid'],
+                    'stage': reload_id,
+                })
             entries.extend(new_entries)
-            if len(new_entries) < self._PAGE_SIZE:
-                break
 
         desc = self._html_search_meta('description', webpage, fatal=False)
         playlist_title = desc.split(',')[0] if desc else None
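Not part of the patch: a standalone sketch, run against made-up HTML loosely shaped like Youku's episode panes, of the helpers the rewritten YoukuShowIE relies on (get_element_by_class from youtube_dl.utils plus the two regexes above). The markup and reload ids are invented for illustration.

    import re

    from youtube_dl.utils import get_element_by_class

    fake_html = (
        '<div id="reload_10052"></div>'
        '<ul><li data-id="reload_10052"></li><li data-id="reload_10053"></li></ul>'
        '<div class="p-drama-grid">'
        '<a href="//v.youku.com/v_show/id_XMTIzNDU2Nzg5MA==.html">EP01</a>'
        '</div>')

    first_page_reload_id = re.search(r'<div[^>]+id="(reload_\d+)', fake_html).group(1)
    reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', fake_html)
    drama_list = get_element_by_class('p-drama-grid', fake_html)
    video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)

    print(first_page_reload_id)  # reload_10052
    print(reload_ids)            # ['reload_10052', 'reload_10053']
    print(video_urls)            # ['//v.youku.com/v_show/id_XMTIzNDU2Nzg5MA==.html']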