Merge branch 'master' of github.com:rg3/youtube-dl

* 'master' of github.com:rg3/youtube-dl:
  [cloudflarestream] Add support for videodelivery.net (#21049)
  [byutv] Improve extraction and update DVR test (closes #20676)
  [byutv] Add support for DVR videos (closes #20574)
  [gfycat] Add support for URLs with tags (closes #20696) (#20731)
  [utils] Transliterate "þ" as "th" (#20897)
  [openload] Add support for verystream.com (closes #20701) (#20967)
  [youtube] Use sp field value for signature field name (closes #18841, closes #18927, closes #21028)
  [yahoo:gyao] extend _VALID_URL(closes #21008)
  [youtube] Fix channel id extraction (closes #20982) (#21003)
This commit is contained in:
Paolo de Dios 2019-05-10 13:21:21 -07:00
commit 90125fc6e6
10 changed files with 146 additions and 43 deletions

View File

@ -183,7 +183,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(sanitize_filename( self.assertEqual(sanitize_filename(
'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True), 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True),
'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYPssaaaaaaaeceeeeiiiionooooooooeuuuuuypy') 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYTHssaaaaaaaeceeeeiiiionooooooooeuuuuuythy')
def test_sanitize_ids(self): def test_sanitize_ids(self):
self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')

View File

@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration
class BYUtvIE(InfoExtractor): class BYUtvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'
_TESTS = [{ _TESTS = [{
# ooyalaVOD
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
'info_dict': { 'info_dict': {
'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH',
@ -22,6 +24,20 @@ class BYUtvIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['Ooyala'], 'add_ie': ['Ooyala'],
}, {
# dvr
'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2',
'info_dict': {
'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451',
'display_id': 'byu-softball-pacific-vs-byu-41219---game-2',
'ext': 'mp4',
'title': 'Pacific vs. BYU (4/12/19)',
'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3',
'duration': 11645,
},
'params': {
'skip_download': True
},
}, { }, {
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d',
'only_matching': True, 'only_matching': True,
@ -35,24 +51,42 @@ class BYUtvIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id display_id = mobj.group('display_id') or video_id
ep = self._download_json( info = self._download_json(
'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, 'https://api.byutv.org/api3/catalog/getvideosforcontent',
query={ display_id, query={
'contentid': video_id, 'contentid': video_id,
'channel': 'byutv', 'channel': 'byutv',
'x-byutv-context': 'web$US', 'x-byutv-context': 'web$US',
}, headers={ }, headers={
'x-byutv-context': 'web$US', 'x-byutv-context': 'web$US',
'x-byutv-platformkey': 'xsaaw9c7y5', 'x-byutv-platformkey': 'xsaaw9c7y5',
})['ooyalaVOD'] })
ep = info.get('ooyalaVOD')
if ep:
return {
'_type': 'url_transparent',
'ie_key': 'Ooyala',
'url': 'ooyala:%s' % ep['providerId'],
'id': video_id,
'display_id': display_id,
'title': ep.get('title'),
'description': ep.get('description'),
'thumbnail': ep.get('imageThumbnail'),
}
ep = info['dvr']
title = ep['title']
formats = self._extract_m3u8_formats(
ep['videoUrl'], video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
self._sort_formats(formats)
return { return {
'_type': 'url_transparent',
'ie_key': 'Ooyala',
'url': 'ooyala:%s' % ep['providerId'],
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'title': ep.get('title'), 'title': title,
'description': ep.get('description'), 'description': ep.get('description'),
'thumbnail': ep.get('imageThumbnail'), 'thumbnail': ep.get('imageThumbnail'),
'duration': parse_duration(ep.get('length')),
'formats': formats,
} }

View File

@ -10,8 +10,8 @@ class CloudflareStreamIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?:watch\.)?cloudflarestream\.com/| (?:watch\.)?(?:cloudflarestream\.com|videodelivery\.net)/|
embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo= embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo=
) )
(?P<id>[\da-f]+) (?P<id>[\da-f]+)
''' '''
@ -31,6 +31,9 @@ class CloudflareStreamIE(InfoExtractor):
}, { }, {
'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e',
'only_matching': True,
}] }]
@staticmethod @staticmethod
@ -38,7 +41,7 @@ class CloudflareStreamIE(InfoExtractor):
return [ return [
mobj.group('url') mobj.group('url')
for mobj in re.finditer( for mobj in re.finditer(
r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1',
webpage)] webpage)]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -833,7 +833,10 @@ from .ooyala import (
OoyalaIE, OoyalaIE,
OoyalaExternalIE, OoyalaExternalIE,
) )
from .openload import OpenloadIE from .openload import (
OpenloadIE,
VerystreamIE,
)
from .ora import OraTVIE from .ora import OraTVIE
from .orf import ( from .orf import (
ORFTVthekIE, ORFTVthekIE,

View File

@ -89,7 +89,10 @@ from .piksel import PikselIE
from .videa import VideaIE from .videa import VideaIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE from .ustream import UstreamIE
from .openload import OpenloadIE from .openload import (
OpenloadIE,
VerystreamIE,
)
from .videopress import VideoPressIE from .videopress import VideoPressIE
from .rutube import RutubeIE from .rutube import RutubeIE
from .limelight import LimelightBaseIE from .limelight import LimelightBaseIE
@ -3017,6 +3020,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
# Look for Verystream embeds
verystream_urls = VerystreamIE._extract_urls(webpage)
if verystream_urls:
return self.playlist_from_matches(
verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key())
# Look for VideoPress embeds # Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage) videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls: if videopress_urls:

View File

@ -11,7 +11,7 @@ from ..utils import (
class GfycatIE(InfoExtractor): class GfycatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/|gifs/detail/)?(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/|gifs/detail/)?(?P<id>[^-/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
'info_dict': { 'info_dict': {
@ -47,6 +47,9 @@ class GfycatIE(InfoExtractor):
}, { }, {
'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull', 'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull',
'only_matching': True 'only_matching': True
}, {
'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball',
'only_matching': True
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -254,7 +254,10 @@ class OpenloadIE(InfoExtractor):
(?:f|embed)/ (?:f|embed)/
(?P<id>[a-zA-Z0-9-_]+) (?P<id>[a-zA-Z0-9-_]+)
''' % _DOMAINS ''' % _DOMAINS
_EMBED_WORD = 'embed'
_STREAM_WORD = 'f'
_REDIR_WORD = 'stream'
_URL_IDS = ('streamurl', 'streamuri', 'streamurj')
_TESTS = [{ _TESTS = [{
'url': 'https://openload.co/f/kUEfGclsU9o', 'url': 'https://openload.co/f/kUEfGclsU9o',
'md5': 'bf1c059b004ebc7a256f89408e65c36e', 'md5': 'bf1c059b004ebc7a256f89408e65c36e',
@ -1948,11 +1951,16 @@ class OpenloadIE(InfoExtractor):
'69.0.3497.28', '69.0.3497.28',
) )
@staticmethod @classmethod
def _extract_urls(webpage): def _extract_urls(cls, webpage):
return re.findall( return re.findall(
r'<iframe[^>]+src=["\']((?:https?://)?%s/embed/[a-zA-Z0-9-_]+)' r'<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)'
% OpenloadIE._DOMAINS, webpage) % (cls._DOMAINS, cls._EMBED_WORD), webpage)
def _extract_decrypted_page(self, page_url, webpage, video_id, headers):
phantom = PhantomJSwrapper(self, required_version='2.0')
webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers)
return webpage
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -1964,9 +1972,9 @@ class OpenloadIE(InfoExtractor):
'User-Agent': self._USER_AGENT_TPL % random.choice(self._CHROME_VERSIONS), 'User-Agent': self._USER_AGENT_TPL % random.choice(self._CHROME_VERSIONS),
} }
for path in ('embed', 'f'): for path in (self._EMBED_WORD, self._STREAM_WORD):
page_url = url_pattern % path page_url = url_pattern % path
last = path == 'f' last = path == self._STREAM_WORD
webpage = self._download_webpage( webpage = self._download_webpage(
page_url, video_id, 'Downloading %s webpage' % path, page_url, video_id, 'Downloading %s webpage' % path,
headers=headers, fatal=last) headers=headers, fatal=last)
@ -1978,21 +1986,20 @@ class OpenloadIE(InfoExtractor):
raise ExtractorError('File not found', expected=True, video_id=video_id) raise ExtractorError('File not found', expected=True, video_id=video_id)
break break
phantom = PhantomJSwrapper(self, required_version='2.0') webpage = self._extract_decrypted_page(page_url, webpage, video_id, headers)
webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) for element_id in self._URL_IDS:
decoded_id = get_element_by_id(element_id, webpage)
decoded_id = (get_element_by_id('streamurl', webpage) or if decoded_id:
get_element_by_id('streamuri', webpage) or break
get_element_by_id('streamurj', webpage) or if not decoded_id:
self._search_regex( decoded_id = self._search_regex(
(r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<',
r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)',
r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<',
r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<',
r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage,
'stream URL')) 'stream URL')
video_url = 'https://%s/%s/%s?mime=true' % (host, self._REDIR_WORD, decoded_id)
video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id)
title = self._og_search_title(webpage, default=None) or self._search_regex( title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
@ -2012,3 +2019,38 @@ class OpenloadIE(InfoExtractor):
'subtitles': subtitles, 'subtitles': subtitles,
'http_headers': headers, 'http_headers': headers,
} }
class VerystreamIE(OpenloadIE):
IE_NAME = 'verystream'
_DOMAINS = r'(?:verystream\.com)'
_VALID_URL = r'''(?x)
https?://
(?P<host>
(?:www\.)?
%s
)/
(?:stream|e)/
(?P<id>[a-zA-Z0-9-_]+)
''' % _DOMAINS
_EMBED_WORD = 'e'
_STREAM_WORD = 'stream'
_REDIR_WORD = 'gettoken'
_URL_IDS = ('videolink', )
_TESTS = [{
'url': 'https://verystream.com/stream/c1GWQ9ngBBx/',
'md5': 'd3e8c5628ccb9970b65fd65269886795',
'info_dict': {
'id': 'c1GWQ9ngBBx',
'ext': 'mp4',
'title': 'Big Buck Bunny.mp4',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
'url': 'https://verystream.com/e/c1GWQ9ngBBx/',
'only_matching': True,
}]
def _extract_decrypted_page(self, page_url, webpage, video_id, headers):
return webpage # for Verystream, the webpage is already decrypted

View File

@ -526,7 +526,7 @@ class YahooGyaOPlayerIE(InfoExtractor):
class YahooGyaOIE(InfoExtractor): class YahooGyaOIE(InfoExtractor):
IE_NAME = 'yahoo:gyao' IE_NAME = 'yahoo:gyao'
_VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/p|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+)' _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title/[^/]+)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{ _TESTS = [{
'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
'info_dict': { 'info_dict': {
@ -536,6 +536,9 @@ class YahooGyaOIE(InfoExtractor):
}, { }, {
'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -1987,7 +1987,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
signature = self._decrypt_signature( signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate) encrypted_sig, video_id, player_url, age_gate)
url += '&signature=' + signature sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature)
if 'ratebypass' not in url: if 'ratebypass' not in url:
url += '&ratebypass=yes' url += '&ratebypass=yes'
@ -2100,8 +2101,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else: else:
self._downloader.report_warning('unable to extract uploader nickname') self._downloader.report_warning('unable to extract uploader nickname')
channel_id = self._html_search_meta( channel_id = (
'channelId', video_webpage, 'channel id') str_or_none(video_details.get('channelId')) or
self._html_search_meta(
'channelId', video_webpage, 'channel id', default=None) or
self._search_regex(
r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
video_webpage, 'channel id', default=None, group='id'))
channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
# thumbnail image # thumbnail image

View File

@ -125,8 +125,8 @@ KNOWN_EXTENSIONS = (
# needed for sanitizing filenames in restricted mode # needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
DATE_FORMATS = ( DATE_FORMATS = (
'%d %B %Y', '%d %B %Y',