Merge remote-tracking branch 'upstream/master'

# Conflicts:
#	youtube_dl/extractor/facebook.py
This commit is contained in:
Avi Peretz 2019-07-13 22:55:40 +03:00
commit d08fed0d41
14 changed files with 288 additions and 160 deletions

View File

@ -6,7 +6,6 @@ from ..utils import (
ExtractorError, ExtractorError,
remove_end, remove_end,
) )
from .rudo import RudoIE
class BioBioChileTVIE(InfoExtractor): class BioBioChileTVIE(InfoExtractor):
@ -41,11 +40,15 @@ class BioBioChileTVIE(InfoExtractor):
}, { }, {
'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml',
'info_dict': { 'info_dict': {
'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', 'id': 'b4xd0LK3SK',
'ext': 'mp4', 'ext': 'mp4',
'uploader': '(none)', # TODO: fix url_transparent information overriding
'upload_date': '20160708', # 'uploader': 'Juan Pablo Echenique',
'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', 'title': 'Comentario Oscar Cáceres',
},
'params': {
# empty m3u8 manifest
'skip_download': True,
}, },
}, { }, {
'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
@ -60,7 +63,9 @@ class BioBioChileTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
rudo_url = RudoIE._extract_url(webpage) rudo_url = self._search_regex(
r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
webpage, 'embed URL', None, group='url')
if not rudo_url: if not rudo_url:
raise ExtractorError('No videos found') raise ExtractorError('No videos found')
@ -68,7 +73,7 @@ class BioBioChileTVIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex( uploader = self._html_search_regex(
r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>',
webpage, 'uploader', fatal=False) webpage, 'uploader', fatal=False)
return { return {

View File

@ -71,7 +71,7 @@ class BleacherReportIE(InfoExtractor):
video = article_data.get('video') video = article_data.get('video')
if video: if video:
video_type = video['type'] video_type = video['type']
if video_type == 'cms.bleacherreport.com': if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'):
info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id']
elif video_type == 'ooyala.com': elif video_type == 'ooyala.com':
info['url'] = 'ooyala:%s' % video['id'] info['url'] = 'ooyala:%s' % video['id']
@ -87,9 +87,9 @@ class BleacherReportIE(InfoExtractor):
class BleacherReportCMSIE(AMPIE): class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
_TESTS = [{ _TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': { 'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
@ -101,6 +101,6 @@ class BleacherReportCMSIE(AMPIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id)
info['id'] = video_id info['id'] = video_id
return info return info

View File

@ -7,50 +7,51 @@ from .common import InfoExtractor
class DBTVIE(InfoExtractor): class DBTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?' _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})'
_TESTS = [{ _TESTS = [{
'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/',
'md5': '2e24f67936517b143a234b4cadf792ec', 'md5': 'b8f850ba1860adbda668d367f9b77699',
'info_dict': { 'info_dict': {
'id': '3649835190001', 'id': 'PynxJnNWChE',
'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen',
'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0', 'description': 'md5:49cc8370e7d66e8a2ef15c3b4631fd3f',
'thumbnail': r're:https?://.*\.jpg', 'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1404039863, 'upload_date': '20160916',
'upload_date': '20140629', 'duration': 69,
'duration': 69.544, 'uploader_id': 'UCk5pvsyZJoYJBd7_oFPTlRQ',
'uploader_id': '1027729757001', 'uploader': 'Dagbladet',
}, },
'add_ie': ['BrightcoveNew'] 'add_ie': ['Youtube']
}, { }, {
'url': 'http://dbtv.no/3649835190001', 'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false',
'only_matching': True, 'only_matching': True,
}, { }, {
'url': 'http://www.dbtv.no/lazyplayer/4631135248001', 'url': 'https://www.dagbladet.no/video/truer-iran-bor-passe-dere/PalfB2Cw',
'only_matching': True,
}, {
'url': 'http://dbtv.no/vice/5000634109001',
'only_matching': True,
}, {
'url': 'http://dbtv.no/filmtrailer/3359293614001',
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return [url for _, url in re.findall( return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1', r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1',
webpage)] webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id, display_id = re.match(self._VALID_URL, url).groups() display_id, video_id = re.match(self._VALID_URL, url).groups()
info = {
return {
'_type': 'url_transparent', '_type': 'url_transparent',
'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id,
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'ie_key': 'BrightcoveNew',
} }
if len(video_id) == 11:
info.update({
'url': video_id,
'ie_key': 'Youtube',
})
else:
info.update({
'url': 'jwplatform:' + video_id,
'ie_key': 'JWPlatform',
})
return info

View File

@ -0,0 +1,94 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import int_or_none
class DLiveVODIE(InfoExtractor):
    """Extractor for past broadcasts addressed as dlive.tv/p/<uploader>+<id>."""

    IE_NAME = 'dlive:vod'
    _VALID_URL = r'https?://(?:www\.)?dlive\.tv/p/(?P<uploader_id>.+?)\+(?P<id>[a-zA-Z0-9]+)'
    _TEST = {
        'url': 'https://dlive.tv/p/pdp+3mTzOl4WR',
        'info_dict': {
            'id': '3mTzOl4WR',
            'ext': 'mp4',
            'title': 'Minecraft with james charles epic',
            'upload_date': '20190701',
            'timestamp': 1562011015,
            'uploader_id': 'pdp',
        }
    }

    def _real_extract(self, url):
        """Look up the broadcast through the GraphQL endpoint and build its info dict.

        The permlink sent to the API is "<uploader_id>+<id>", both parts taken
        from the URL. A missing 'title' or 'playbackUrl' in the response raises
        KeyError; every other field is optional.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader_id = mobj.group('uploader_id')
        vod_id = mobj.group('id')

        # Query text assembled line by line; the joined result is the exact
        # payload the API receives.
        query = '\n'.join((
            'query {',
            'pastBroadcast(permlink:"%s+%s") {',
            'content',
            'createdAt',
            'length',
            'playbackUrl',
            'title',
            'thumbnailUrl',
            'viewCount',
            '}',
            '}',
        )) % (uploader_id, vod_id)

        response = self._download_json(
            'https://graphigo.prd.dlive.tv/', vod_id,
            data=json.dumps({'query': query}).encode())
        broadcast = response['data']['pastBroadcast']

        # Title is mandatory, so fail on it before fetching the manifest.
        title = broadcast['title']

        formats = self._extract_m3u8_formats(
            broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native')
        self._sort_formats(formats)

        info = {
            'id': vod_id,
            'title': title,
            'uploader_id': uploader_id,
            'formats': formats,
        }
        info['description'] = broadcast.get('content')
        info['thumbnail'] = broadcast.get('thumbnailUrl')
        # NOTE(review): the 1000 is presumably a milliseconds-to-seconds
        # scale - confirm against utils.int_or_none.
        info['timestamp'] = int_or_none(broadcast.get('createdAt'), 1000)
        info['view_count'] = int_or_none(broadcast.get('viewCount'))
        return info
class DLiveStreamIE(InfoExtractor):
    """Extractor for channel pages on dlive.tv (any path not starting with p/)."""

    IE_NAME = 'dlive:stream'
    _VALID_URL = r'https?://(?:www\.)?dlive\.tv/(?!p/)(?P<id>[\w.-]+)'

    def _real_extract(self, url):
        """Resolve a display name to its live stream and build the info dict.

        The GraphQL endpoint is asked for the user's livestream record and
        username; the HLS manifest URL is derived from that username. The
        'livestream', 'title' and 'username' fields are required in the
        response.
        """
        display_name = self._match_id(url)

        # Query text assembled line by line; the joined result is the exact
        # payload the API receives.
        query = '\n'.join((
            'query {',
            'userByDisplayName(displayname:"%s") {',
            'livestream {',
            'content',
            'createdAt',
            'title',
            'thumbnailUrl',
            'watchingCount',
            '}',
            'username',
            '}',
            '}',
        )) % display_name

        response = self._download_json(
            'https://graphigo.prd.dlive.tv/', display_name,
            data=json.dumps({'query': query}).encode())
        user = response['data']['userByDisplayName']

        # Required fields are read first so a malformed response fails
        # before any manifest request is made.
        livestream = user['livestream']
        title = livestream['title']
        username = user['username']

        manifest_url = 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username
        formats = self._extract_m3u8_formats(manifest_url, display_name, 'mp4')
        self._sort_formats(formats)

        info = {
            'id': display_name,
            'title': self._live_title(title),
            'uploader': display_name,
            'uploader_id': username,
            'formats': formats,
        }
        info['description'] = livestream.get('content')
        info['thumbnail'] = livestream.get('thumbnailUrl')
        info['is_live'] = True
        # NOTE(review): the 1000 is presumably a milliseconds-to-seconds
        # scale - confirm against utils.int_or_none.
        info['timestamp'] = int_or_none(livestream.get('createdAt'), 1000)
        info['view_count'] = int_or_none(livestream.get('watchingCount'))
        return info

View File

@ -579,6 +579,7 @@ from .linkedin import (
) )
from .linuxacademy import LinuxAcademyIE from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE from .litv import LiTVIE
from .livejournal import LiveJournalIE
from .liveleak import ( from .liveleak import (
LiveLeakIE, LiveLeakIE,
LiveLeakEmbedIE, LiveLeakEmbedIE,
@ -967,7 +968,6 @@ from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE
from .rtvnh import RTVNHIE from .rtvnh import RTVNHIE
from .rtvs import RTVSIE from .rtvs import RTVSIE
from .rudo import RudoIE
from .ruhd import RUHDIE from .ruhd import RUHDIE
from .rutube import ( from .rutube import (
RutubeIE, RutubeIE,
@ -1255,6 +1255,10 @@ from .udn import UDNEmbedIE
from .ufctv import UFCTVIE from .ufctv import UFCTVIE
from .uktvplay import UKTVPlayIE from .uktvplay import UKTVPlayIE
from .digiteka import DigitekaIE from .digiteka import DigitekaIE
from .dlive import (
DLiveVODIE,
DLiveStreamIE,
)
from .umg import UMGDeIE from .umg import UMGDeIE
from .unistra import UnistraIE from .unistra import UnistraIE
from .unity import UnityIE from .unity import UnityIE

View File

@ -462,8 +462,8 @@ class FacebookIE(InfoExtractor):
r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary,
'uploader_id', fatal=False) 'uploader_id', fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
if is_live: if is_live:
view_count = parse_count( view_count = parse_count(
self._search_regex(r'viewerCount:([\d]+)', webpage, 'views', fatal=False) or \ self._search_regex(r'viewerCount:([\d]+)', webpage, 'views', fatal=False) or \

View File

@ -0,0 +1,42 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import int_or_none
class LiveJournalIE(InfoExtractor):
    """Extractor for livejournal.com video album pages carrying an id= parameter."""

    _VALID_URL = r'https?://(?:[^.]+\.)?livejournal\.com/video/album/\d+.+?\bid=(?P<id>\d+)'
    _TEST = {
        'url': 'https://andrei-bt.livejournal.com/video/album/407/?mode=view&id=51272',
        'md5': 'adaf018388572ced8a6f301ace49d4b2',
        'info_dict': {
            'id': '1263729',
            'ext': 'mp4',
            'title': 'Истребители против БПЛА',
            'upload_date': '20190624',
            'timestamp': 1561406715,
        }
    }

    def _real_extract(self, url):
        """Read the page's Site.page JSON and hand off to the EaglePlatform extractor.

        The video record embedded in the page supplies the storage id used to
        build the 'eagleplatform:' URL, plus optional name, thumbnail and
        creation time. Returns a 'url_transparent' result.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        page_json = self._search_regex(
            r'Site\.page\s*=\s*({.+?});', webpage,
            'page data')
        page_data = self._parse_json(page_json, video_id)
        record = page_data['video']['record']

        storage_id = compat_str(record['storageid'])

        title = record.get('name')
        if title:
            # The name looks like a file name; drop whatever follows the
            # last dot (.mp4, .mov, etc.).
            title = title.rsplit('.', 1)[0]

        result = {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
        }
        result['thumbnail'] = record.get('thumbnail')
        result['timestamp'] = int_or_none(record.get('timecreate'))
        result['url'] = 'eagleplatform:vc.videos.livejournal.com:' + storage_id
        result['ie_key'] = 'EaglePlatform'
        return result

View File

@ -117,6 +117,10 @@ class LyndaIE(LyndaBaseIE):
}, { }, {
'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html',
'only_matching': True, 'only_matching': True,
}, {
# Status="NotFound", Message="Transcript not found"
'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html',
'only_matching': True,
}] }]
def _raise_unavailable(self, video_id): def _raise_unavailable(self, video_id):
@ -247,11 +251,16 @@ class LyndaIE(LyndaBaseIE):
def _get_subtitles(self, video_id): def _get_subtitles(self, video_id):
url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
subs = self._download_json(url, None, False) subs = self._download_webpage(
url, video_id, 'Downloading subtitles JSON', fatal=False)
if not subs or 'Status="NotFound"' in subs:
return {}
subs = self._parse_json(subs, video_id, fatal=False)
if not subs:
return {}
fixed_subs = self._fix_subtitles(subs) fixed_subs = self._fix_subtitles(subs)
if fixed_subs: if fixed_subs:
return {'en': [{'ext': 'srt', 'data': fixed_subs}]} return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
else:
return {} return {}

View File

@ -4,11 +4,14 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
strip_or_none, str_or_none,
unescapeHTML,
urlencode_postdata, urlencode_postdata,
) )
@ -21,15 +24,14 @@ class RoosterTeethIE(InfoExtractor):
'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
'md5': 'e2bd7764732d785ef797700a2489f212', 'md5': 'e2bd7764732d785ef797700a2489f212',
'info_dict': { 'info_dict': {
'id': '26576', 'id': '9156',
'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', 'title': 'Million Dollars, But... The Game Announcement',
'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5',
'thumbnail': r're:^https?://.*\.png$', 'thumbnail': r're:^https?://.*\.png$',
'series': 'Million Dollars, But...', 'series': 'Million Dollars, But...',
'episode': 'Million Dollars, But... The Game Announcement', 'episode': 'Million Dollars, But... The Game Announcement',
'comment_count': int,
}, },
}, { }, {
'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
@ -89,60 +91,55 @@ class RoosterTeethIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
api_episode_url = 'https://svod-be.roosterteeth.com/api/v1/episodes/%s' % display_id
webpage = self._download_webpage(url, display_id) try:
m3u8_url = self._download_json(
episode = strip_or_none(unescapeHTML(self._search_regex( api_episode_url + '/videos', display_id,
(r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', 'Downloading video JSON metadata')['data'][0]['attributes']['url']
r'<title>(?P<title>[^<]+)</title>'), webpage, 'title', except ExtractorError as e:
default=None, group='title'))) if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
title = strip_or_none(self._og_search_title(
webpage, default=None)) or episode
m3u8_url = self._search_regex(
r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1',
webpage, 'm3u8 url', default=None, group='url')
if not m3u8_url:
if re.search(r'<div[^>]+class=["\']non-sponsor', webpage):
self.raise_login_required( self.raise_login_required(
'%s is only available for FIRST members' % display_id) '%s is only available for FIRST members' % display_id)
raise
if re.search(r'<div[^>]+class=["\']golive-gate', webpage):
self.raise_login_required('%s is not available yet' % display_id)
raise ExtractorError('Unable to extract m3u8 URL')
formats = self._extract_m3u8_formats( formats = self._extract_m3u8_formats(
m3u8_url, display_id, ext='mp4', m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats) self._sort_formats(formats)
description = strip_or_none(self._og_search_description(webpage)) episode = self._download_json(
thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) api_episode_url, display_id,
'Downloading episode JSON metadata')['data'][0]
attributes = episode['attributes']
title = attributes.get('title') or attributes['display_title']
video_id = compat_str(episode['id'])
series = self._search_regex( thumbnails = []
(r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'), for image in episode.get('included', {}).get('images', []):
webpage, 'series', fatal=False) if image.get('type') == 'episode_image':
img_attributes = image.get('attributes') or {}
comment_count = int_or_none(self._search_regex( for k in ('thumb', 'small', 'medium', 'large'):
r'>Comments \((\d+)\)<', webpage, img_url = img_attributes.get(k)
'comment count', fatal=False)) if img_url:
thumbnails.append({
video_id = self._search_regex( 'id': k,
(r'containerId\s*=\s*["\']episode-(\d+)\1', 'url': img_url,
r'<div[^<]+id=["\']episode-(\d+)'), webpage, })
'video id', default=display_id)
return { return {
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': title,
'description': description, 'description': attributes.get('description') or attributes.get('caption'),
'thumbnail': thumbnail, 'thumbnails': thumbnails,
'series': series, 'series': attributes.get('show_title'),
'episode': episode, 'season_number': int_or_none(attributes.get('season_number')),
'comment_count': comment_count, 'season_id': attributes.get('season_id'),
'episode': title,
'episode_number': int_or_none(attributes.get('number')),
'episode_id': str_or_none(episode.get('uuid')),
'formats': formats, 'formats': formats,
'channel_id': attributes.get('channel_id'),
'duration': int_or_none(attributes.get('length')),
} }

View File

@ -1,53 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
get_element_by_class,
unified_strdate,
)
class RudoIE(InfoExtractor):
    """Extractor for rudo.video/vod/<id> player pages."""

    _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)'
    _TEST = {
        'url': 'http://rudo.video/vod/oTzw0MGnyG',
        'md5': '2a03a5b32dd90a04c83b6d391cf7b415',
        'info_dict': {
            'id': 'oTzw0MGnyG',
            'ext': 'mp4',
            'title': 'Comentario Tomás Mosciatti',
            'upload_date': '20160617',
        },
    }

    @classmethod
    def _extract_url(cls, webpage):
        """Return the first rudo.video iframe URL found in webpage, or None."""
        found = re.search(
            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
            webpage)
        return found.group('url') if found else None

    def _real_extract(self, url):
        """Parse the playerInstance.setup({...}) call on the page into an info dict.

        Title comes from the page's Open Graph data and upload date from the
        element with class 'date'; both overwrite whatever the JWPlayer data
        provided.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id, encoding='iso-8859-1')

        def _to_json(source):
            # encodeURI(...) calls are swapped for empty string literals
            # before the JS object is converted to JSON.
            return js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', source))

        setup_args = self._search_regex(
            r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data')
        jwplayer_data = self._parse_json(
            setup_args, video_id, transform_source=_to_json)

        info_dict = self._parse_jwplayer_data(
            jwplayer_data, video_id, require_title=False, m3u8_id='hls', mpd_id='dash')

        info_dict.update({
            'title': self._og_search_title(webpage),
            'upload_date': unified_strdate(get_element_by_class('date', webpage)),
        })

        return info_dict

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
merge_dicts,
orderedSet, orderedSet,
parse_duration, parse_duration,
parse_resolution, parse_resolution,
@ -26,6 +27,8 @@ class SpankBangIE(InfoExtractor):
'description': 'dillion harper masturbates on a bed', 'description': 'dillion harper masturbates on a bed',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'silly2587', 'uploader': 'silly2587',
'timestamp': 1422571989,
'upload_date': '20150129',
'age_limit': 18, 'age_limit': 18,
} }
}, { }, {
@ -106,31 +109,36 @@ class SpankBangIE(InfoExtractor):
for format_id, format_url in stream.items(): for format_id, format_url in stream.items():
if format_id.startswith(STREAM_URL_PREFIX): if format_id.startswith(STREAM_URL_PREFIX):
if format_url and isinstance(format_url, list):
format_url = format_url[0]
extract_format( extract_format(
format_id[len(STREAM_URL_PREFIX):], format_url) format_id[len(STREAM_URL_PREFIX):], format_url)
self._sort_formats(formats) self._sort_formats(formats)
info = self._search_json_ld(webpage, video_id, default={})
title = self._html_search_regex( title = self._html_search_regex(
r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None)
description = self._search_regex( description = self._search_regex(
r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)',
webpage, 'description', fatal=False) webpage, 'description', default=None)
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage, default=None)
uploader = self._search_regex( uploader = self._html_search_regex(
r'class="user"[^>]*><img[^>]+>([^<]+)', (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>',
r'class="user"[^>]*><img[^>]+>([^<]+)'),
webpage, 'uploader', default=None) webpage, 'uploader', default=None)
duration = parse_duration(self._search_regex( duration = parse_duration(self._search_regex(
r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)',
webpage, 'duration', fatal=False)) webpage, 'duration', default=None))
view_count = str_to_int(self._search_regex( view_count = str_to_int(self._search_regex(
r'([\d,.]+)\s+plays', webpage, 'view count', fatal=False)) r'([\d,.]+)\s+plays', webpage, 'view count', default=None))
age_limit = self._rta_search(webpage) age_limit = self._rta_search(webpage)
return { return merge_dicts({
'id': video_id, 'id': video_id,
'title': title, 'title': title or video_id,
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'uploader': uploader, 'uploader': uploader,
@ -138,7 +146,8 @@ class SpankBangIE(InfoExtractor):
'view_count': view_count, 'view_count': view_count,
'formats': formats, 'formats': formats,
'age_limit': age_limit, 'age_limit': age_limit,
} }, info
)
class SpankBangPlaylistIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor):

View File

@ -22,7 +22,7 @@ class BellatorIE(MTVServicesInfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
_FEED_URL = 'http://www.spike.com/feeds/mrss/' _FEED_URL = 'http://www.bellator.com/feeds/mrss/'
_GEO_COUNTRIES = ['US'] _GEO_COUNTRIES = ['US']

View File

@ -438,11 +438,22 @@ class TwitterIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
}, },
}, {
'url': 'https://twitter.com/foobar/status/1087791357756956680',
'info_dict': {
'id': '1087791357756956680',
'ext': 'mp4',
'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:66d493500c013e3e2d434195746a7f78',
'uploader': 'Twitter',
'uploader_id': 'Twitter',
'duration': 61.567,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
user_id = mobj.group('user_id')
twid = mobj.group('id') twid = mobj.group('id')
webpage, urlh = self._download_webpage_handle( webpage, urlh = self._download_webpage_handle(
@ -451,8 +462,13 @@ class TwitterIE(InfoExtractor):
if 'twitter.com/account/suspended' in urlh.geturl(): if 'twitter.com/account/suspended' in urlh.geturl():
raise ExtractorError('Account suspended by Twitter.', expected=True) raise ExtractorError('Account suspended by Twitter.', expected=True)
if user_id is None: user_id = None
mobj = re.match(self._VALID_URL, urlh.geturl())
redirect_mobj = re.match(self._VALID_URL, urlh.geturl())
if redirect_mobj:
user_id = redirect_mobj.group('user_id')
if not user_id:
user_id = mobj.group('user_id') user_id = mobj.group('user_id')
username = remove_end(self._og_search_title(webpage), ' on Twitter') username = remove_end(self._og_search_title(webpage), ' on Twitter')

View File

@ -371,10 +371,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?hooktube\.com/| (?:www\.)?hooktube\.com/|
(?:www\.)?yourepeat\.com/| (?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/| tube\.majestyc\.net/|
# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
(?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|dev)\.)?invidio\.us/|
(?:www\.)?invidiou\.sh/| (?:(?:www|no)\.)?invidiou\.sh/|
(?:www\.)?invidious\.snopyta\.org/| (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?invidious\.enkirton\.net/|
(?:www\.)?invidious\.13ad\.de/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/| (?:www\.)?vid\.wxzm\.sx/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls (?:.*?\#/)? # handle anchor (#/) redirect urls