Merge branch 'master' into GoogleDrive-issue-13619

This commit is contained in:
Parmjit Virk 2017-07-20 15:55:52 -05:00
commit 90793eef1d
11 changed files with 205 additions and 63 deletions

View File

@ -1,3 +1,10 @@
version <unreleased>
Extractors
* [youku:show] Fix playlist extraction (#13248)
+ [dispeak] Recognize sevt subdomain (#13276)
version 2017.07.15 version 2017.07.15
Core Core

View File

@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
'id': 'testid', 'id': 'testid',
'title': 'testttitle', 'title': 'testttitle',
'extractor': 'testex', 'extractor': 'testex',
'extractor_key': 'TestEx',
} }
res.update(**kwargs) res.update(**kwargs)
return res return res
@ -761,7 +762,8 @@ class TestYoutubeDL(unittest.TestCase):
'_type': 'url_transparent', '_type': 'url_transparent',
'url': 'foo2:', 'url': 'foo2:',
'ie_key': 'Foo2', 'ie_key': 'Foo2',
'title': 'foo1 title' 'title': 'foo1 title',
'id': 'foo1_id',
} }
class Foo2IE(InfoExtractor): class Foo2IE(InfoExtractor):
@ -787,6 +789,9 @@ class TestYoutubeDL(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['url'], TEST_URL) self.assertEqual(downloaded['url'], TEST_URL)
self.assertEqual(downloaded['title'], 'foo1 title') self.assertEqual(downloaded['title'], 'foo1 title')
self.assertEqual(downloaded['id'], 'testid')
self.assertEqual(downloaded['extractor'], 'testex')
self.assertEqual(downloaded['extractor_key'], 'TestEx')
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -860,7 +860,7 @@ class YoutubeDL(object):
force_properties = dict( force_properties = dict(
(k, v) for k, v in ie_result.items() if v is not None) (k, v) for k, v in ie_result.items() if v is not None)
for f in ('_type', 'url', 'ie_key'): for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
if f in force_properties: if f in force_properties:
del force_properties[f] del force_properties[f]
new_result = info.copy() new_result = info.copy()

View File

@ -13,7 +13,7 @@ from ..utils import (
class DigitallySpeakingIE(InfoExtractor): class DigitallySpeakingIE(InfoExtractor):
_VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
_TESTS = [{ _TESTS = [{
# From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
# From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
'only_matching': True, 'only_matching': True,
}, {
# From http://www.gdcvault.com/play/1013700/Advanced-Material
'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
'only_matching': True,
}] }]
def _parse_mp4(self, metadata): def _parse_mp4(self, metadata):

View File

@ -2,6 +2,11 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
int_or_none,
try_get,
unified_timestamp,
)
class EggheadCourseIE(InfoExtractor): class EggheadCourseIE(InfoExtractor):
@ -33,3 +38,47 @@ class EggheadCourseIE(InfoExtractor):
return self.playlist_result( return self.playlist_result(
entries, playlist_id, course.get('title'), entries, playlist_id, course.get('title'),
course.get('description')) course.get('description'))
class EggheadLessonIE(InfoExtractor):
IE_DESC = 'egghead.io lesson'
IE_NAME = 'egghead:lesson'
_VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
'info_dict': {
'id': 'fv5yotjxcg',
'ext': 'mp4',
'title': 'Create linear data flow with container style types (Box)',
'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
'thumbnail': r're:^https?:.*\.jpg$',
'timestamp': 1481296768,
'upload_date': '20161209',
'duration': 304,
'view_count': 0,
'tags': ['javascript', 'free'],
},
'params': {
'skip_download': True,
},
}
def _real_extract(self, url):
lesson_id = self._match_id(url)
lesson = self._download_json(
'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
return {
'_type': 'url_transparent',
'ie_key': 'Wistia',
'url': 'wistia:%s' % lesson['wistia_id'],
'id': lesson['wistia_id'],
'title': lesson.get('title'),
'description': lesson.get('summary'),
'thumbnail': lesson.get('thumb_nail'),
'timestamp': unified_timestamp(lesson.get('published_at')),
'duration': int_or_none(lesson.get('duration')),
'view_count': int_or_none(lesson.get('plays_count')),
'tags': try_get(lesson, lambda x: x['tag_list'], list),
}

View File

@ -298,7 +298,10 @@ from .dw import (
from .eagleplatform import EaglePlatformIE from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE from .echomsk import EchoMskIE
from .egghead import EggheadCourseIE from .egghead import (
EggheadCourseIE,
EggheadLessonIE,
)
from .ehow import EHowIE from .ehow import EHowIE
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE from .einthusan import EinthusanIE

View File

@ -1,10 +1,14 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
unified_timestamp,
)
class FunnyOrDieIE(InfoExtractor): class FunnyOrDieIE(InfoExtractor):
@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Heart-Shaped Box: Literal Video Version', 'title': 'Heart-Shaped Box: Literal Video Version',
'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
'thumbnail': r're:^http:.*\.jpg$', 'thumbnail': r're:^http:.*\.jpg$',
'uploader': 'DASjr',
'timestamp': 1317904928,
'upload_date': '20111006',
'duration': 318.3,
}, },
}, { }, {
'url': 'http://www.funnyordie.com/embed/e402820827', 'url': 'http://www.funnyordie.com/embed/e402820827',
@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Please Use This Song (Jon Lajoie)', 'title': 'Please Use This Song (Jon Lajoie)',
'description': 'Please use this to sell something. www.jonlajoie.com', 'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': r're:^http:.*\.jpg$', 'thumbnail': r're:^http:.*\.jpg$',
'timestamp': 1398988800,
'upload_date': '20140502',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
'url': 'http://www.funnyordie.com%s' % src, 'url': 'http://www.funnyordie.com%s' % src,
}] }]
post_json = self._search_regex( timestamp = unified_timestamp(self._html_search_meta(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') 'uploadDate', webpage, 'timestamp', default=None))
post = json.loads(post_json)
uploader = self._html_search_regex(
r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
webpage, 'uploader', default=None)
title, description, thumbnail, duration = [None] * 4
medium = self._parse_json(
self._search_regex(
r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
default='{}'),
video_id, fatal=False)
if medium:
title = medium.get('title')
duration = float_or_none(medium.get('duration'))
if not timestamp:
timestamp = unified_timestamp(medium.get('publishDate'))
post = self._parse_json(
self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
default='{}'),
video_id, fatal=False)
if post:
if not title:
title = post.get('name')
description = post.get('description')
thumbnail = post.get('picture')
if not title:
title = self._og_search_title(webpage)
if not description:
description = self._og_search_description(webpage)
if not duration:
duration = int_or_none(self._html_search_meta(
('video:duration', 'duration'), webpage, 'duration', default=False))
return { return {
'id': video_id, 'id': video_id,
'title': post['name'], 'title': title,
'description': post.get('description'), 'description': description,
'thumbnail': post.get('picture'), 'thumbnail': thumbnail,
'uploader': uploader,
'timestamp': timestamp,
'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }

View File

@ -1569,27 +1569,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
# Nexx iFrame embed
{
'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
'info_dict': {
'id': '161464',
'ext': 'mp4',
'title': 'Nervenkitzel Achterbahn',
'alt_title': 'Karussellbauer in Deutschland',
'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
'release_year': 2005,
'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2761,
'timestamp': 1394021479,
'upload_date': '20140305',
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
},
# Facebook <iframe> embed # Facebook <iframe> embed
{ {
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',

View File

@ -122,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
}, },
'playlist_count': 6, 'playlist_count': 6,
}, {
# Nexx iFrame embed
'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
'info_dict': {
'id': '161464',
'ext': 'mp4',
'title': 'Nervenkitzel Achterbahn',
'alt_title': 'Karussellbauer in Deutschland',
'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
'release_year': 2005,
'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2761,
'timestamp': 1394021479,
'upload_date': '20140305',
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -8,6 +8,9 @@ from ..utils import extract_attributes
class TBSIE(TurnerBaseIE): class TBSIE(TurnerBaseIE):
# https://github.com/rg3/youtube-dl/issues/13658
_WORKING = False
_VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
_TESTS = [{ _TESTS = [{
'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Theatrical Trailer', 'title': 'Theatrical Trailer',
'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
} },
'skip': 'TBS videos are deleted after a while',
}, { }, {
'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'You Better Run', 'title': 'You Better Run',
'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
} },
'skip': 'TBS videos are deleted after a while',
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools
import random import random
import re import re
import string import string
@ -14,7 +13,6 @@ from ..utils import (
js_to_json, js_to_json,
str_or_none, str_or_none,
strip_jsonp, strip_jsonp,
urljoin,
) )
@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
_VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
IE_NAME = 'youku:show' IE_NAME = 'youku:show'
_TEST = { _TESTS = [{
'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
'info_dict': { 'info_dict': {
'id': 'zc7c670be07ff11e48b3f', 'id': 'zc7c670be07ff11e48b3f',
'title': '花千骨 未删减', 'title': '花千骨 DVD',
'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
}, },
'playlist_count': 50, 'playlist_count': 50,
} }, {
# Episode number not starting from 1
'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
'info_dict': {
'id': 'zefbfbd70efbfbd780bef',
'title': '超级飞侠3',
'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
},
'playlist_count': 24,
}, {
# Ongoing playlist. The initial page is the last one
'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
'only_matchine': True,
}]
_PAGE_SIZE = 40 def _extract_entries(self, playlist_data_url, show_id, note, query):
query['callback'] = 'cb'
playlist_data = self._download_json(
playlist_data_url, show_id, query=query, note=note,
transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
get_element_by_class('p-drama-half-row', playlist_data))
if drama_list is None:
raise ExtractorError('No episodes found')
video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
return playlist_data, [
self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
for video_url in video_urls]
def _real_extract(self, url): def _real_extract(self, url):
show_id = self._match_id(url) show_id = self._match_id(url)
@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
page_config = self._parse_json(self._search_regex( page_config = self._parse_json(self._search_regex(
r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
show_id, transform_source=js_to_json) show_id, transform_source=js_to_json)
for idx in itertools.count(0): first_page, initial_entries = self._extract_entries(
if idx == 0: 'http://list.youku.com/show/module', show_id,
playlist_data_url = 'http://list.youku.com/show/module' note='Downloading initial playlist data page',
query = {'id': page_config['showid'], 'tab': 'point'}
else:
playlist_data_url = 'http://list.youku.com/show/point'
query={ query={
'id': page_config['showid'], 'id': page_config['showid'],
'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1), 'tab': 'showInfo',
} })
query['callback'] = 'cb' first_page_reload_id = self._html_search_regex(
playlist_data = self._download_json( r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
playlist_data_url, show_id, query=query, # The first reload_id has the same items as first_page
reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
for idx, reload_id in enumerate(reload_ids):
if reload_id == first_page_reload_id:
entries.extend(initial_entries)
continue
_, new_entries = self._extract_entries(
'http://list.youku.com/show/episode', show_id,
note='Downloading playlist data page %d' % (idx + 1), note='Downloading playlist data page %d' % (idx + 1),
transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] query={
video_urls = re.findall( 'id': page_config['showid'],
r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"', 'stage': reload_id,
playlist_data) })
new_entries = [
self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
for video_url in video_urls]
entries.extend(new_entries) entries.extend(new_entries)
if len(new_entries) < self._PAGE_SIZE:
break
desc = self._html_search_meta('description', webpage, fatal=False) desc = self._html_search_meta('description', webpage, fatal=False)
playlist_title = desc.split(',')[0] if desc else None playlist_title = desc.split(',')[0] if desc else None