This commit is contained in:
Rooty 2018-06-14 16:31:36 +01:00
commit 7bd772a69f
43 changed files with 1370 additions and 890 deletions

View File

@ -6,8 +6,8 @@
--- ---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. ### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** - [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14**
### Before submitting an *issue* make sure you have: ### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: [] [debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2018.05.26 [debug] youtube-dl version 2018.06.14
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {} [debug] Proxy map: {}

1
.gitignore vendored
View File

@ -47,3 +47,4 @@ youtube-dl.zsh
*.iml *.iml
tmp/ tmp/
venv/

View File

@ -1,3 +1,85 @@
version 2018.06.14
Core
* [downloader/http] Fix retry on error when streaming to stdout (#16699)
Extractors
+ [discoverynetworks] Add support for disco-api videos (#16724)
+ [dailymotion] Add support for password protected videos (#9789)
+ [abc:iview] Add support for livestreams (#12354)
* [abc:iview] Fix extraction (#16704)
+ [crackle] Add support for sonycrackle.com (#16698)
+ [tvnet] Add support for tvnet.gov.vn (#15462)
* [nrk] Update API hosts and try all previously known ones (#16690)
* [wimp] Fix Youtube embeds extraction
version 2018.06.11
Extractors
* [npo] Extend URL regular expression and add support for npostart.nl (#16682)
+ [inc] Add support for another embed schema (#16666)
* [tv4] Fix format extraction (#16650)
+ [nexx] Add support for free cdn (#16538)
+ [pbs] Add another cove id pattern (#15373)
+ [rbmaradio] Add support for 192k format (#16631)
version 2018.06.04
Extractors
+ [camtube] Add support for camtube.co
+ [twitter:card] Extract guest token (#16609)
+ [chaturbate] Use geo verification headers
+ [bbc] Add support for bbcthree (#16612)
* [youtube] Move metadata extraction after video availability check
+ [youtube] Extract track and artist
+ [safari] Add support for new URL schema (#16614)
* [adn] Fix extraction
version 2018.06.02
Core
* [utils] Improve determine_ext
Extractors
+ [facebook] Add support for tahoe player videos (#15441, #16554)
* [cbc] Improve extraction (#16583, #16593)
* [openload] Improve ext extraction (#16595)
+ [twitter:card] Add support for another endpoint (#16586)
+ [openload] Add support for oload.win and oload.download (#16592)
* [audimedia] Fix extraction (#15309)
+ [francetv] Add support for sport.francetvinfo.fr (#15645)
* [mlb] Improve extraction (#16587)
- [nhl] Remove old extractors
* [rbmaradio] Check formats availability (#16585)
version 2018.05.30
Core
* [downloader/rtmp] Generalize download messages and report time elapsed
on finish
* [downloader/rtmp] Gracefully handle live streams interrupted by user
Extractors
* [teamcoco] Fix extraction for full episodes (#16573)
* [spiegel] Fix info extraction (#16538)
+ [apa] Add support for apa.at (#15041, #15672)
+ [bellmedia] Add support for bnnbloomberg.ca (#16560)
+ [9c9media] Extract MPD formats and subtitles
* [cammodels] Use geo verification headers
+ [ufctv] Add support for authentication (#16542)
+ [cammodels] Add support for cammodels.com (#14499)
* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt
(#16551)
* [soundcloud] Detect format extension (#16549)
* [cbc] Fix playlist title extraction (#16502)
+ [tumblr] Detect and report sensitive media (#13829)
+ [tumblr] Add support for authentication (#15133)
version 2018.05.26 version 2018.05.26
Core Core

View File

@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year)
for fn in glob.glob('*.html*'): for fn in glob.glob('*.html*'):
with io.open(fn, encoding='utf-8') as f: with io.open(fn, encoding='utf-8') as f:
content = f.read() content = f.read()
newc = re.sub(r'(?P<copyright>Copyright © 2006-)(?P<year>[0-9]{4})', 'Copyright © 2006-' + year, content) newc = re.sub(r'(?P<copyright>Copyright © 2011-)(?P<year>[0-9]{4})', 'Copyright © 2011-' + year, content)
if content != newc: if content != newc:
tmpFn = fn + '.part' tmpFn = fn + '.part'
with io.open(tmpFn, 'wt', encoding='utf-8') as outf: with io.open(tmpFn, 'wt', encoding='utf-8') as outf:

View File

@ -15,7 +15,6 @@
- **8tracks** - **8tracks**
- **91porn** - **91porn**
- **9c9media** - **9c9media**
- **9c9media:stack**
- **9gag** - **9gag**
- **9now.com.au** - **9now.com.au**
- **abc.net.au** - **abc.net.au**
@ -48,6 +47,7 @@
- **anitube.se** - **anitube.se**
- **Anvato** - **Anvato**
- **AnySex** - **AnySex**
- **APA**
- **Aparat** - **Aparat**
- **AppleConnect** - **AppleConnect**
- **AppleDaily**: 臺灣蘋果日報 - **AppleDaily**: 臺灣蘋果日報
@ -128,6 +128,8 @@
- **BYUtv** - **BYUtv**
- **Camdemy** - **Camdemy**
- **CamdemyFolder** - **CamdemyFolder**
- **CamModels**
- **CamTube**
- **CamWithHer** - **CamWithHer**
- **canalc2.tv** - **canalc2.tv**
- **Canalplus**: mycanal.fr and piwiplus.fr - **Canalplus**: mycanal.fr and piwiplus.fr
@ -552,9 +554,6 @@
- **nfl.com** - **nfl.com**
- **NhkVod** - **NhkVod**
- **nhl.com** - **nhl.com**
- **nhl.com:news**: NHL news
- **nhl.com:videocenter**
- **nhl.com:videocenter:category**: NHL videocenter category
- **nick.com** - **nick.com**
- **nick.de** - **nick.de**
- **nickelodeon:br** - **nickelodeon:br**
@ -792,6 +791,7 @@
- **Spiegel** - **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de - **Spiegel:Article**: Articles on spiegel.de
- **Spiegeltv** - **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5** - **Sport5**
- **SportBoxEmbed** - **SportBoxEmbed**
- **SportDeutschland** - **SportDeutschland**
@ -893,6 +893,7 @@
- **tvigle**: Интернет-телевидение Tvigle.ru - **tvigle**: Интернет-телевидение Tvigle.ru
- **tvland.com** - **tvland.com**
- **TVN24** - **TVN24**
- **TVNet**
- **TVNoe** - **TVNoe**
- **TVNow** - **TVNow**
- **TVNowList** - **TVNowList**

View File

@ -2,5 +2,5 @@
universal = True universal = True
[flake8] [flake8]
exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv
ignore = E402,E501,E731,E741 ignore = E402,E501,E731,E741

View File

@ -361,6 +361,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None)
self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None) self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None)
self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8') self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8')
self.assertEqual(determine_ext('foobar', None), None)
def test_find_xpath_attr(self): def test_find_xpath_attr(self):
testxml = '''<root> testxml = '''<root>

View File

@ -217,10 +217,11 @@ class HttpFD(FileDownloader):
before = start # start measuring before = start # start measuring
def retry(e): def retry(e):
if ctx.tmpfilename != '-': to_stdout = ctx.tmpfilename == '-'
if not to_stdout:
ctx.stream.close() ctx.stream.close()
ctx.stream = None ctx.stream = None
ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename))
raise RetryDownload(e) raise RetryDownload(e)
while True: while True:

View File

@ -105,22 +105,22 @@ class ABCIE(InfoExtractor):
class ABCIViewIE(InfoExtractor): class ABCIViewIE(InfoExtractor):
IE_NAME = 'abc.net.au:iview' IE_NAME = 'abc.net.au:iview'
_VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P<id>[^/?#]+)' _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
_GEO_COUNTRIES = ['AU'] _GEO_COUNTRIES = ['AU']
# ABC iview programs are normally available for 14 days only. # ABC iview programs are normally available for 14 days only.
_TESTS = [{ _TESTS = [{
'url': 'https://iview.abc.net.au/programs/ben-and-hollys-little-kingdom/ZY9247A021S00', 'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00',
'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc',
'info_dict': { 'info_dict': {
'id': 'ZY9247A021S00', 'id': 'ZX9371A050S00',
'ext': 'mp4', 'ext': 'mp4',
'title': "Gaston's Visit", 'title': "Gaston's Birthday",
'series': "Ben And Holly's Little Kingdom", 'series': "Ben And Holly's Little Kingdom",
'description': 'md5:18db170ad71cf161e006a4c688e33155', 'description': 'md5:f9de914d02f226968f598ac76f105bcf',
'upload_date': '20180318', 'upload_date': '20180604',
'uploader_id': 'abc4kids', 'uploader_id': 'abc4kids',
'timestamp': 1521400959, 'timestamp': 1528140219,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -129,17 +129,16 @@ class ABCIViewIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) video_params = self._download_json(
video_params = self._parse_json(self._search_regex( 'https://iview.abc.net.au/api/programs/' + video_id, video_id)
r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) title = unescapeHTML(video_params.get('title') or video_params['seriesTitle'])
title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream'))
stream = next(s for s in video_params['playlist'] if s.get('type') == 'program')
house_number = video_params.get('episodeHouseNumber') house_number = video_params.get('episodeHouseNumber') or video_id
path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format(
int(time.time()), house_number) int(time.time()), house_number)
sig = hmac.new( sig = hmac.new(
'android.content.res.Resources'.encode('utf-8'), b'android.content.res.Resources',
path.encode('utf-8'), hashlib.sha256).hexdigest() path.encode('utf-8'), hashlib.sha256).hexdigest()
token = self._download_webpage( token = self._download_webpage(
'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id)
@ -169,18 +168,26 @@ class ABCIViewIE(InfoExtractor):
'ext': 'vtt', 'ext': 'vtt',
}] }]
is_live = video_params.get('livestream') == '1'
if is_live:
title = self._live_title(title)
return { return {
'id': video_id, 'id': video_id,
'title': unescapeHTML(title), 'title': title,
'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), 'description': video_params.get('description'),
'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), 'thumbnail': video_params.get('thumbnail'),
'duration': int_or_none(video_params.get('eventDuration')), 'duration': int_or_none(video_params.get('eventDuration')),
'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '),
'series': unescapeHTML(video_params.get('seriesTitle')), 'series': unescapeHTML(video_params.get('seriesTitle')),
'series_id': video_params.get('seriesHouseNumber') or video_id[:7], 'series_id': video_params.get('seriesHouseNumber') or video_id[:7],
'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage, default=None)), 'season_number': int_or_none(self._search_regex(
'episode': self._html_search_meta('episode_title', webpage, default=None), r'\bSeries\s+(\d+)\b', title, 'season number', default=None)),
'episode_number': int_or_none(self._search_regex(
r'\bEp\s+(\d+)\b', title, 'episode number', default=None)),
'episode_id': house_number,
'uploader_id': video_params.get('channel'), 'uploader_id': video_params.get('channel'),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'is_live': is_live,
} }

View File

@ -1,8 +1,11 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import base64
import binascii
import json import json
import os import os
import random
from .common import InfoExtractor from .common import InfoExtractor
from ..aes import aes_cbc_decrypt from ..aes import aes_cbc_decrypt
@ -12,9 +15,12 @@ from ..compat import (
) )
from ..utils import ( from ..utils import (
bytes_to_intlist, bytes_to_intlist,
bytes_to_long,
ExtractorError, ExtractorError,
float_or_none, float_or_none,
intlist_to_bytes, intlist_to_bytes,
long_to_bytes,
pkcs1pad,
srt_subtitles_timecode, srt_subtitles_timecode,
strip_or_none, strip_or_none,
urljoin, urljoin,
@ -35,6 +41,7 @@ class ADNIE(InfoExtractor):
} }
} }
_BASE_URL = 'http://animedigitalnetwork.fr' _BASE_URL = 'http://animedigitalnetwork.fr'
_RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537)
def _get_subtitles(self, sub_path, video_id): def _get_subtitles(self, sub_path, video_id):
if not sub_path: if not sub_path:
@ -42,16 +49,14 @@ class ADNIE(InfoExtractor):
enc_subtitles = self._download_webpage( enc_subtitles = self._download_webpage(
urljoin(self._BASE_URL, sub_path), urljoin(self._BASE_URL, sub_path),
video_id, fatal=False, headers={ video_id, fatal=False)
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
})
if not enc_subtitles: if not enc_subtitles:
return None return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')),
bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
)) ))
subtitles_json = self._parse_json( subtitles_json = self._parse_json(
@ -112,11 +117,24 @@ class ADNIE(InfoExtractor):
error = None error = None
if not links: if not links:
links_url = player_config.get('linksurl') or options['videoUrl'] links_url = player_config.get('linksurl') or options['videoUrl']
links_data = self._download_json(urljoin( token = options['token']
self._BASE_URL, links_url), video_id) self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
message = bytes_to_intlist(json.dumps({
'k': self._K,
'e': 60,
't': token,
}))
padded_message = intlist_to_bytes(pkcs1pad(message, 128))
n, e = self._RSA_KEY
encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
authorization = base64.b64encode(encrypted_message).decode()
links_data = self._download_json(
urljoin(self._BASE_URL, links_url), video_id, headers={
'Authorization': 'Bearer ' + authorization,
})
links = links_data.get('links') or {} links = links_data.get('links') or {}
metas = metas or links_data.get('meta') or {} metas = metas or links_data.get('meta') or {}
sub_path = sub_path or links_data.get('subtitles') sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token
error = links_data.get('error') error = links_data.get('error')
title = metas.get('title') or video_info['title'] title = metas.get('title') or video_info['title']

View File

@ -5,13 +5,12 @@ from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
sanitized_Request,
) )
class AudiMediaIE(InfoExtractor): class AudiMediaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P<id>[^/?#]+)'
_TEST = { _TESTS = [{
'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467', 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
'md5': '79a8b71c46d49042609795ab59779b66', 'md5': '79a8b71c46d49042609795ab59779b66',
'info_dict': { 'info_dict': {
@ -24,41 +23,46 @@ class AudiMediaIE(InfoExtractor):
'duration': 74022, 'duration': 74022,
'view_count': int, 'view_count': int,
} }
} }, {
# extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991',
_AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' 'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
raw_payload = self._search_regex([ raw_payload = self._search_regex([
r'class="amtv-embed"[^>]+id="([^"]+)"', r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"',
r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"',
r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"',
r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"',
r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})',
], webpage, 'raw payload') ], webpage, 'raw payload')
_, stage_mode, video_id, lang = raw_payload.split('-') _, stage_mode, video_id, _ = raw_payload.split('-')
# TODO: handle s and e stage_mode (live streams and ended live streams) # TODO: handle s and e stage_mode (live streams and ended live streams)
if stage_mode not in ('s', 'e'): if stage_mode not in ('s', 'e'):
request = sanitized_Request( video_data = self._download_json(
'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), 'https://www.audimedia.tv/api/video/v1/videos/' + video_id,
headers={'X-Auth-Token': self._AUTH_TOKEN}) video_id, query={
json_data = self._download_json(request, video_id)['results'] 'embed[]': ['video_versions', 'thumbnail_image'],
})['results']
formats = [] formats = []
stream_url_hls = json_data.get('stream_url_hls') stream_url_hls = video_data.get('stream_url_hls')
if stream_url_hls: if stream_url_hls:
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
stream_url_hls, video_id, 'mp4', stream_url_hls, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
stream_url_hds = json_data.get('stream_url_hds') stream_url_hds = video_data.get('stream_url_hds')
if stream_url_hds: if stream_url_hds:
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
stream_url_hds + '?hdcore=3.4.0', stream_url_hds + '?hdcore=3.4.0',
video_id, f4m_id='hds', fatal=False)) video_id, f4m_id='hds', fatal=False))
for video_version in json_data.get('video_versions'): for video_version in video_data.get('video_versions', []):
video_version_url = video_version.get('download_url') or video_version.get('stream_url') video_version_url = video_version.get('download_url') or video_version.get('stream_url')
if not video_version_url: if not video_version_url:
continue continue
@ -79,11 +83,11 @@ class AudiMediaIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'title': json_data['title'], 'title': video_data['title'],
'description': json_data.get('subtitle'), 'description': video_data.get('subtitle'),
'thumbnail': json_data.get('thumbnail_image', {}).get('file'), 'thumbnail': video_data.get('thumbnail_image', {}).get('file'),
'timestamp': parse_iso8601(json_data.get('publication_date')), 'timestamp': parse_iso8601(video_data.get('publication_date')),
'duration': int_or_none(json_data.get('duration')), 'duration': int_or_none(video_data.get('duration')),
'view_count': int_or_none(json_data.get('view_count')), 'view_count': int_or_none(video_data.get('view_count')),
'formats': formats, 'formats': formats,
} }

View File

@ -12,6 +12,7 @@ from ..utils import (
float_or_none, float_or_none,
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
js_to_json,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
try_get, try_get,
@ -772,6 +773,17 @@ class BBCIE(BBCCoUkIE):
# single video article embedded with data-media-vpid # single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
'info_dict': {
'id': 'p06556y7',
'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
},
'params': {
'skip_download': True,
}
}] }]
@classmethod @classmethod
@ -994,6 +1006,36 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles, 'subtitles': subtitles,
} }
bbc3_config = self._parse_json(
self._search_regex(
r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
'bbcthree config', default='{}'),
playlist_id, transform_source=js_to_json, fatal=False)
if bbc3_config:
bbc3_playlist = try_get(
bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
dict)
if bbc3_playlist:
playlist_title = bbc3_playlist.get('title') or playlist_title
thumbnail = bbc3_playlist.get('holdingImageURL')
entries = []
for bbc3_item in bbc3_playlist['items']:
programme_id = bbc3_item.get('versionID')
if not programme_id:
continue
formats, subtitles = self._download_media_selector(programme_id)
self._sort_formats(formats)
entries.append({
'id': programme_id,
'title': playlist_title,
'thumbnail': thumbnail,
'timestamp': timestamp,
'formats': formats,
'subtitles': subtitles,
})
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def extract_all(pattern): def extract_all(pattern):
return list(filter(None, map( return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False), lambda s: self._parse_json(s, playlist_id, fatal=False),

View File

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_timestamp,
)
class CamTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female',
'info_dict': {
'id': '42ad3956-dd5b-445a-8313-803ea6079fac',
'display_id': 'minafay-030618-1136-chaturbate-female',
'ext': 'mp4',
'title': 'minafay-030618-1136-chaturbate-female',
'duration': 1274,
'timestamp': 1528018608,
'upload_date': '20180603',
},
'params': {
'skip_download': True,
},
}]
_API_BASE = 'https://api.camtube.co'
def _real_extract(self, url):
display_id = self._match_id(url)
token = self._download_json(
'%s/rpc/session/new' % self._API_BASE, display_id,
'Downloading session token')['token']
self._set_cookie('api.camtube.co', 'session', token)
video = self._download_json(
'%s/recordings/%s' % (self._API_BASE, display_id), display_id,
headers={'Referer': url})
video_id = video['uuid']
timestamp = unified_timestamp(video.get('createdAt'))
duration = int_or_none(video.get('duration'))
view_count = int_or_none(video.get('viewCount'))
like_count = int_or_none(video.get('likeCount'))
creator = video.get('stageName')
formats = [{
'url': '%s/recordings/%s/manifest.m3u8'
% (self._API_BASE, video_id),
'format_id': 'hls',
'ext': 'mp4',
'protocol': 'm3u8_native',
}]
return {
'id': video_id,
'display_id': display_id,
'title': display_id,
'timestamp': timestamp,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
'creator': creator,
'formats': formats,
}

View File

@ -17,6 +17,7 @@ from ..utils import (
xpath_element, xpath_element,
xpath_with_ns, xpath_with_ns,
find_xpath_attr, find_xpath_attr,
orderedSet,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
parse_age_limit, parse_age_limit,
@ -136,9 +137,15 @@ class CBCIE(InfoExtractor):
entries = [ entries = [
self._extract_player_init(player_init, display_id) self._extract_player_init(player_init, display_id)
for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
media_ids = []
for media_id_re in (
r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
r'<div[^>]+\bid=["\']player-(\d+)',
r'guid["\']\s*:\s*["\'](\d+)'):
media_ids.extend(re.findall(media_id_re, webpage))
entries.extend([ entries.extend([
self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]) for media_id in orderedSet(media_ids)])
return self.playlist_result( return self.playlist_result(
entries, display_id, strip_or_none(title), entries, display_id, strip_or_none(title),
self._og_search_description(webpage)) self._og_search_description(webpage))

View File

@ -31,7 +31,8 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(
url, video_id, headers=self.geo_verification_headers())
m3u8_urls = [] m3u8_urls = []

View File

@ -19,8 +19,8 @@ from ..utils import (
class CrackleIE(InfoExtractor): class CrackleIE(InfoExtractor):
_VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
_TEST = { _TESTS = [{
# geo restricted to CA # geo restricted to CA
'url': 'https://www.crackle.com/andromeda/2502343', 'url': 'https://www.crackle.com/andromeda/2502343',
'info_dict': { 'info_dict': {
@ -45,7 +45,10 @@ class CrackleIE(InfoExtractor):
# m3u8 download # m3u8 download
'skip_download': True, 'skip_download': True,
} }
} }, {
'url': 'https://www.sonycrackle.com/andromeda/2502343',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -1,12 +1,16 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import base64
import json import hashlib
import itertools import itertools
import json
import random
import re
import string
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_struct_pack
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
error_to_compat_str, error_to_compat_str,
@ -64,7 +68,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'uploader': 'Deadline', 'uploader': 'Deadline',
'uploader_id': 'x1xm8ri', 'uploader_id': 'x1xm8ri',
'age_limit': 0, 'age_limit': 0,
'view_count': int,
}, },
}, { }, {
'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
@ -167,6 +170,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
player = self._parse_json(player_v5, video_id) player = self._parse_json(player_v5, video_id)
metadata = player['metadata'] metadata = player['metadata']
if metadata.get('error', {}).get('type') == 'password_protected':
password = self._downloader.params.get('videopassword')
if password:
r = int(metadata['id'][1:], 36)
us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
t = ''.join(random.choice(string.ascii_letters) for i in range(10))
n = us64e(compat_struct_pack('I', r))
i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
metadata = self._download_json(
'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)
self._check_error(metadata) self._check_error(metadata)
formats = [] formats = []
@ -302,8 +316,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
def _check_error(self, info): def _check_error(self, info):
error = info.get('error') error = info.get('error')
if info.get('error') is not None: if error:
title = error['title'] title = error.get('title') or error['message']
# See https://developer.dailymotion.com/api#access-error # See https://developer.dailymotion.com/api#access-error
if error.get('code') == 'DM007': if error.get('code') == 'DM007':
self.raise_geo_restricted(msg=title) self.raise_geo_restricted(msg=title)

View File

@ -3,8 +3,8 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE from .brightcove import BrightcoveLegacyIE
from .dplay import DPlayIE
from ..compat import ( from ..compat import (
compat_parse_qs, compat_parse_qs,
compat_urlparse, compat_urlparse,
@ -12,8 +12,13 @@ from ..compat import (
from ..utils import smuggle_url from ..utils import smuggle_url
class DiscoveryNetworksDeIE(InfoExtractor): class DiscoveryNetworksDeIE(DPlayIE):
_VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))' _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/
(?:
.*\#(?P<id>\d+)|
(?:[^/]+/)*videos/(?P<display_id>[^/?#]+)|
programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)
)'''
_TESTS = [{ _TESTS = [{
'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
@ -40,6 +45,14 @@ class DiscoveryNetworksDeIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
alternate_id = mobj.group('alternate_id')
if alternate_id:
self._initialize_geo_bypass({
'countries': ['DE'],
})
return self._get_disco_api_info(
url, '%s/%s' % (mobj.group('programme'), alternate_id),
'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de')
brightcove_id = mobj.group('id') brightcove_id = mobj.group('id')
if not brightcove_id: if not brightcove_id:
title = mobj.group('title') title = mobj.group('title')

View File

@ -97,6 +97,75 @@ class DPlayIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _get_disco_api_info(self, url, display_id, disco_host, realm):
disco_base = 'https://' + disco_host
token = self._download_json(
'%s/token' % disco_base, display_id, 'Downloading token',
query={
'realm': realm,
})['data']['attributes']['token']
headers = {
'Referer': url,
'Authorization': 'Bearer ' + token,
}
video = self._download_json(
'%s/content/videos/%s' % (disco_base, display_id), display_id,
headers=headers, query={
'include': 'show'
})
video_id = video['data']['id']
info = video['data']['attributes']
title = info['name']
formats = []
for format_id, format_dict in self._download_json(
'%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id),
display_id, headers=headers)['data']['attributes']['streaming'].items():
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
if not format_url:
continue
ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, display_id, mpd_id='dash', fatal=False))
elif format_id == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, display_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
self._sort_formats(formats)
series = None
try:
included = video.get('included')
if isinstance(included, list):
show = next(e for e in included if e.get('type') == 'show')
series = try_get(
show, lambda x: x['attributes']['name'], compat_str)
except StopIteration:
pass
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': info.get('description'),
'duration': float_or_none(
info.get('videoDuration'), scale=1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
'season_number': int_or_none(info.get('seasonNumber')),
'episode_number': int_or_none(info.get('episodeNumber')),
'age_limit': int_or_none(info.get('minimum_age')),
'formats': formats,
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id') display_id = mobj.group('id')
@ -113,72 +182,8 @@ class DPlayIE(InfoExtractor):
if not video_id: if not video_id:
host = mobj.group('host') host = mobj.group('host')
disco_base = 'https://disco-api.%s' % host return self._get_disco_api_info(
self._download_json( url, display_id, 'disco-api.' + host, host.replace('.', ''))
'%s/token' % disco_base, display_id, 'Downloading token',
query={
'realm': host.replace('.', ''),
})
video = self._download_json(
'%s/content/videos/%s' % (disco_base, display_id), display_id,
headers={
'Referer': url,
'x-disco-client': 'WEB:UNKNOWN:dplay-client:0.0.1',
}, query={
'include': 'show'
})
video_id = video['data']['id']
info = video['data']['attributes']
title = info['name']
formats = []
for format_id, format_dict in self._download_json(
'%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id),
display_id)['data']['attributes']['streaming'].items():
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
if not format_url:
continue
ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, display_id, mpd_id='dash', fatal=False))
elif format_id == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, display_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
self._sort_formats(formats)
series = None
try:
included = video.get('included')
if isinstance(included, list):
show = next(e for e in included if e.get('type') == 'show')
series = try_get(
show, lambda x: x['attributes']['name'], compat_str)
except StopIteration:
pass
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': info.get('description'),
'duration': float_or_none(
info.get('videoDuration'), scale=1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
'season_number': int_or_none(info.get('seasonNumber')),
'episode_number': int_or_none(info.get('episodeNumber')),
'age_limit': int_or_none(info.get('minimum_age')),
'formats': formats,
}
info = self._download_json( info = self._download_json(
'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),

View File

@ -147,6 +147,7 @@ from .camdemy import (
CamdemyFolderIE CamdemyFolderIE
) )
from .cammodels import CamModelsIE from .cammodels import CamModelsIE
from .camtube import CamTubeIE
from .camwithher import CamWithHerIE from .camwithher import CamWithHerIE
from .canalplus import CanalplusIE from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE from .canalc2 import Canalc2IE
@ -381,6 +382,7 @@ from .francetv import (
FranceTVSiteIE, FranceTVSiteIE,
FranceTVEmbedIE, FranceTVEmbedIE,
FranceTVInfoIE, FranceTVInfoIE,
FranceTVInfoSportIE,
FranceTVJeunesseIE, FranceTVJeunesseIE,
GenerationWhatIE, GenerationWhatIE,
CultureboxIE, CultureboxIE,
@ -705,12 +707,7 @@ from .nexx import (
from .nfb import NFBIE from .nfb import NFBIE
from .nfl import NFLIE from .nfl import NFLIE
from .nhk import NhkVodIE from .nhk import NhkVodIE
from .nhl import ( from .nhl import NHLIE
NHLVideocenterIE,
NHLNewsIE,
NHLVideocenterCategoryIE,
NHLIE,
)
from .nick import ( from .nick import (
NickIE, NickIE,
NickBrIE, NickBrIE,
@ -1142,6 +1139,7 @@ from .tvc import (
from .tvigle import TvigleIE from .tvigle import TvigleIE
from .tvland import TVLandIE from .tvland import TVLandIE
from .tvn24 import TVN24IE from .tvn24 import TVN24IE
from .tvnet import TVNetIE
from .tvnoe import TVNoeIE from .tvnoe import TVNoeIE
from .tvnow import ( from .tvnow import (
TVNowIE, TVNowIE,

View File

@ -56,6 +56,7 @@ class FacebookIE(InfoExtractor):
_CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
_VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true'
_TESTS = [{ _TESTS = [{
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
@ -208,6 +209,17 @@ class FacebookIE(InfoExtractor):
# no title # no title
'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
'info_dict': {
'id': '359649331226507',
'ext': 'mp4',
'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
'uploader': 'ESL One Dota 2',
},
'params': {
'skip_download': True,
},
}] }]
@staticmethod @staticmethod
@ -312,16 +324,18 @@ class FacebookIE(InfoExtractor):
if server_js_data: if server_js_data:
video_data = extract_video_data(server_js_data.get('instances', [])) video_data = extract_video_data(server_js_data.get('instances', []))
def extract_from_jsmods_instances(js_data):
if js_data:
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
if not video_data: if not video_data:
server_js_data = self._parse_json( server_js_data = self._parse_json(
self._search_regex( self._search_regex(
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)',
webpage, 'js data', default='{}'), webpage, 'js data', default='{}'),
video_id, transform_source=js_to_json, fatal=False) video_id, transform_source=js_to_json, fatal=False)
if server_js_data: video_data = extract_from_jsmods_instances(server_js_data)
video_data = extract_video_data(try_get(
server_js_data, lambda x: x['jsmods']['instances'],
list) or [])
if not video_data: if not video_data:
if not fatal_if_no_video: if not fatal_if_no_video:
@ -333,8 +347,33 @@ class FacebookIE(InfoExtractor):
expected=True) expected=True)
elif '>You must log in to continue' in webpage: elif '>You must log in to continue' in webpage:
self.raise_login_required() self.raise_login_required()
else:
raise ExtractorError('Cannot parse data') # Video info not in first request, do a secondary request using
# tahoe player specific URL
tahoe_data = self._download_webpage(
self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
data=urlencode_postdata({
'__user': 0,
'__a': 1,
'__pc': self._search_regex(
r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
'pkg cohort', default='PHASED:DEFAULT'),
'__rev': self._search_regex(
r'client_revision["\']\s*:\s*(\d+),', webpage,
'client revision', default='3944515'),
}),
headers={
'Content-Type': 'application/x-www-form-urlencoded',
})
tahoe_js_data = self._parse_json(
self._search_regex(
r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
'tahoe js data', default='{}'),
video_id, fatal=False)
video_data = extract_from_jsmods_instances(tahoe_js_data)
if not video_data:
raise ExtractorError('Cannot parse data')
formats = [] formats = []
for f in video_data: for f in video_data:
@ -380,7 +419,8 @@ class FacebookIE(InfoExtractor):
video_title = 'Facebook video #%s' % video_id video_title = 'Facebook video #%s' % video_id
uploader = clean_html(get_element_by_id( uploader = clean_html(get_element_by_id(
'fbPhotoPageAuthorName', webpage)) or self._search_regex( 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
fatal=False) or self._og_search_title(webpage, fatal=False)
timestamp = int_or_none(self._search_regex( timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage, r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None)) 'timestamp', default=None))

View File

@ -379,6 +379,31 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
return self._make_url_result(video_id, catalogue) return self._make_url_result(video_id, catalogue)
class FranceTVInfoSportIE(FranceTVBaseInfoExtractor):
IE_NAME = 'sport.francetvinfo.fr'
_VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018',
'info_dict': {
'id': '6e49080e-3f45-11e8-b459-000d3a2439ea',
'ext': 'mp4',
'title': 'Retour sur les meilleurs moments de Pyeongchang 2018',
'timestamp': 1523639962,
'upload_date': '20180413',
},
'params': {
'skip_download': True,
},
'add_ie': [FranceTVIE.ie_key()],
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id')
return self._make_url_result(video_id, 'Sport-web')
class GenerationWhatIE(InfoExtractor): class GenerationWhatIE(InfoExtractor):
IE_NAME = 'france2.fr:generation-what' IE_NAME = 'france2.fr:generation-what'
_VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)'

View File

@ -21,6 +21,21 @@ class IncIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, {
# div with id=kaltura_player_1_kqs38cgm
'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html',
'info_dict': {
'id': '1_kqs38cgm',
'ext': 'mp4',
'title': 'Branson: "In the end, you have to say, Screw it. Just do it."',
'description': 'md5:21b832d034f9af5191ca5959da5e9cb6',
'timestamp': 1364403232,
'upload_date': '20130327',
'uploader_id': 'incdigital@inc.com',
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html',
'only_matching': True, 'only_matching': True,
@ -31,10 +46,13 @@ class IncIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
partner_id = self._search_regex( partner_id = self._search_regex(
r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage,
'partner id', default='1034971')
kaltura_id = self._parse_json(self._search_regex( kaltura_id = self._search_regex(
r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id',
default=None, group='id') or self._parse_json(self._search_regex(
r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'),
display_id)['vid_kaltura_id'] display_id)['vid_kaltura_id']
return self.url_result( return self.url_result(

View File

@ -1,96 +1,90 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re from .nhl import NHLBaseIE
from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
)
class MLBIE(InfoExtractor): class MLBIE(NHLBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?:[\da-z_-]+\.)*mlb\.com/ (?:[\da-z_-]+\.)*(?P<site>mlb)\.com/
(?: (?:
(?: (?:
(?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| (?:[^/]+/)*c-|
(?: (?:
shared/video/embed/(?:embed|m-internal-embed)\.html| shared/video/embed/(?:embed|m-internal-embed)\.html|
(?:[^/]+/)+(?:play|index)\.jsp| (?:[^/]+/)+(?:play|index)\.jsp|
)\?.*?\bcontent_id= )\?.*?\bcontent_id=
) )
(?P<id>n?\d+)| (?P<id>\d+)
(?:[^/]+/)*(?P<path>[^/]+)
) )
''' '''
_CONTENT_DOMAIN = 'content.mlb.com'
_TESTS = [ _TESTS = [
{ {
'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',
'md5': 'ff56a598c2cf411a9a38a69709e97079', 'md5': '632358dacfceec06bad823b83d21df2d',
'info_dict': { 'info_dict': {
'id': '34698933', 'id': '34698933',
'ext': 'mp4', 'ext': 'mp4',
'title': "Ackley's spectacular catch", 'title': "Ackley's spectacular catch",
'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0',
'duration': 66, 'duration': 66,
'timestamp': 1405980600, 'timestamp': 1405995000,
'upload_date': '20140721', 'upload_date': '20140722',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
}, },
{ {
'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663',
'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f',
'info_dict': { 'info_dict': {
'id': '34496663', 'id': '34496663',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Stanton prepares for Derby', 'title': 'Stanton prepares for Derby',
'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
'duration': 46, 'duration': 46,
'timestamp': 1405105800, 'timestamp': 1405120200,
'upload_date': '20140711', 'upload_date': '20140711',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
}, },
{ {
'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115',
'md5': '0e6e73d509321e142409b695eadd541f', 'md5': '99bb9176531adc600b90880fb8be9328',
'info_dict': { 'info_dict': {
'id': '34578115', 'id': '34578115',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Cespedes repeats as Derby champ', 'title': 'Cespedes repeats as Derby champ',
'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
'duration': 488, 'duration': 488,
'timestamp': 1405399936, 'timestamp': 1405414336,
'upload_date': '20140715', 'upload_date': '20140715',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
}, },
{ {
'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915',
'md5': 'b8fd237347b844365d74ea61d4245967', 'md5': 'da8b57a12b060e7663ee1eebd6f330ec',
'info_dict': { 'info_dict': {
'id': '34577915', 'id': '34577915',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Bautista on Home Run Derby', 'title': 'Bautista on Home Run Derby',
'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
'duration': 52, 'duration': 52,
'timestamp': 1405390722, 'timestamp': 1405405122,
'upload_date': '20140715', 'upload_date': '20140715',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
}, },
{ {
'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098',
'md5': 'aafaf5b0186fee8f32f20508092f8111', 'md5': 'e09e37b552351fddbf4d9e699c924d68',
'info_dict': { 'info_dict': {
'id': '75609783', 'id': '75609783',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Must C: Pillar climbs for catch', 'title': 'Must C: Pillar climbs for catch',
'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
'timestamp': 1429124820, 'timestamp': 1429139220,
'upload_date': '20150415', 'upload_date': '20150415',
} }
}, },
@ -111,7 +105,7 @@ class MLBIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}, },
{ {
'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783',
'only_matching': True, 'only_matching': True,
}, },
{ {
@ -120,58 +114,7 @@ class MLBIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}, },
{ {
'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842',
'only_matching': True, 'only_matching': True,
} }
] ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
if not video_id:
video_path = mobj.group('path')
webpage = self._download_webpage(url, video_path)
video_id = self._search_regex(
[r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
detail = self._download_xml(
'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
% (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
title = detail.find('./headline').text
description = detail.find('./big-blurb').text
duration = parse_duration(detail.find('./duration').text)
timestamp = parse_iso8601(detail.attrib['date'][:-5])
thumbnails = [{
'url': thumbnail.text,
} for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')]
formats = []
for media_url in detail.findall('./url'):
playback_scenario = media_url.attrib['playback_scenario']
fmt = {
'url': media_url.text,
'format_id': playback_scenario,
}
m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
if m:
fmt.update({
'vbr': int(m.group('vbr')) * 1000,
'width': int(m.group('width')),
'height': int(m.group('height')),
})
formats.append(fmt)
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'duration': duration,
'timestamp': timestamp,
'formats': formats,
'thumbnails': thumbnails,
}

View File

@ -29,14 +29,13 @@ class NexxIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
# movie # movie
'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
'md5': '828cea195be04e66057b846288295ba1', 'md5': '31899fd683de49ad46f4ee67e53e83fe',
'info_dict': { 'info_dict': {
'id': '128907', 'id': '128907',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Stiftung Warentest', 'title': 'Stiftung Warentest',
'alt_title': 'Wie ein Test abläuft', 'alt_title': 'Wie ein Test abläuft',
'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
'release_year': 2013,
'creator': 'SPIEGEL TV', 'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2509, 'duration': 2509,
@ -62,6 +61,7 @@ class NexxIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'skip': 'HTTP Error 404: Not Found',
}, { }, {
# does not work via arc # does not work via arc
'url': 'nexx:741:1269984', 'url': 'nexx:741:1269984',
@ -71,12 +71,26 @@ class NexxIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': '1 TAG ohne KLO... wortwörtlich! 😑', 'title': '1 TAG ohne KLO... wortwörtlich! 😑',
'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑', 'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
'description': 'md5:4604539793c49eda9443ab5c5b1d612f',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 607, 'duration': 607,
'timestamp': 1518614955, 'timestamp': 1518614955,
'upload_date': '20180214', 'upload_date': '20180214',
}, },
}, {
# free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html
'url': 'nexx:747:1533779',
'md5': '6bf6883912b82b7069fb86c2297e9893',
'info_dict': {
'id': '1533779',
'ext': 'mp4',
'title': 'Aufregung um ausgebrochene Raubtiere',
'alt_title': 'Eifel-Zoo',
'description': 'md5:f21375c91c74ad741dcb164c427999d2',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 111,
'timestamp': 1527874460,
'upload_date': '20180601',
},
}, { }, {
'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
'only_matching': True, 'only_matching': True,
@ -141,6 +155,139 @@ class NexxIE(InfoExtractor):
self._handle_error(result) self._handle_error(result)
return result['result'] return result['result']
def _extract_free_formats(self, video, video_id):
stream_data = video['streamdata']
cdn = stream_data['cdnType']
assert cdn == 'free'
hash = video['general']['hash']
ps = compat_str(stream_data['originalDomain'])
if stream_data['applyFolderHierarchy'] == 1:
s = ('%04d' % int(video_id))[::-1]
ps += '/%s/%s' % (s[0:2], s[2:4])
ps += '/%s/%s_' % (video_id, hash)
t = 'http://%s' + ps
fd = stream_data['azureFileDistribution'].split(',')
cdn_provider = stream_data['cdnProvider']
def p0(p):
return '_%s' % p if stream_data['applyAzureStructure'] == 1 else ''
formats = []
if cdn_provider == 'ak':
t += ','
for i in fd:
p = i.split(':')
t += p[1] + p0(int(p[0])) + ','
t += '.mp4.csmil/master.%s'
elif cdn_provider == 'ce':
k = t.split('/')
h = k.pop()
http_base = t = '/'.join(k)
http_base = http_base % stream_data['cdnPathHTTP']
t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream='
for i in fd:
p = i.split(':')
tbr = int(p[0])
filename = '%s%s%s.mp4' % (h, p[1], p0(tbr))
f = {
'url': http_base + '/' + filename,
'format_id': '%s-http-%d' % (cdn, tbr),
'tbr': tbr,
}
width_height = p[1].split('x')
if len(width_height) == 2:
f.update({
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
})
formats.append(f)
a = filename + ':%s' % (tbr * 1000)
t += a + ','
t = t[:-1] + '&audiostream=' + a.split(':')[0]
else:
assert False
if cdn_provider == 'ce':
formats.extend(self._extract_mpd_formats(
t % (stream_data['cdnPathDASH'], 'mpd'), video_id,
mpd_id='%s-dash' % cdn, fatal=False))
formats.extend(self._extract_m3u8_formats(
t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False))
return formats
def _extract_azure_formats(self, video, video_id):
stream_data = video['streamdata']
cdn = stream_data['cdnType']
assert cdn == 'azure'
azure_locator = stream_data['azureLocator']
def get_cdn_shield_base(shield_type='', static=False):
for secure in ('', 's'):
cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
if cdn_shield:
return 'http%s://%s' % (secure, cdn_shield)
else:
if 'fb' in stream_data['azureAccount']:
prefix = 'df' if static else 'f'
else:
prefix = 'd' if static else 'p'
account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', ''))
return 'http://nx-%s%02d.akamaized.net/' % (prefix, account)
language = video['general'].get('language_raw') or ''
azure_stream_base = get_cdn_shield_base()
is_ml = ',' in language
azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % (
azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s'
protection_token = try_get(
video, lambda x: x['protectiondata']['token'], compat_str)
if protection_token:
azure_manifest_url += '?hdnts=%s' % protection_token
formats = self._extract_m3u8_formats(
azure_manifest_url % '(format=m3u8-aapl)',
video_id, 'mp4', 'm3u8_native',
m3u8_id='%s-hls' % cdn, fatal=False)
formats.extend(self._extract_mpd_formats(
azure_manifest_url % '(format=mpd-time-csf)',
video_id, mpd_id='%s-dash' % cdn, fatal=False))
formats.extend(self._extract_ism_formats(
azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False))
azure_progressive_base = get_cdn_shield_base('Prog', True)
azure_file_distribution = stream_data.get('azureFileDistribution')
if azure_file_distribution:
fds = azure_file_distribution.split(',')
if fds:
for fd in fds:
ss = fd.split(':')
if len(ss) == 2:
tbr = int_or_none(ss[0])
if tbr:
f = {
'url': '%s%s/%s_src_%s_%d.mp4' % (
azure_progressive_base, azure_locator, video_id, ss[1], tbr),
'format_id': '%s-http-%d' % (cdn, tbr),
'tbr': tbr,
}
width_height = ss[1].split('x')
if len(width_height) == 2:
f.update({
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
})
formats.append(f)
return formats
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') domain_id = mobj.group('domain_id') or mobj.group('domain_id_s')
@ -220,72 +367,15 @@ class NexxIE(InfoExtractor):
general = video['general'] general = video['general']
title = general['title'] title = general['title']
stream_data = video['streamdata'] cdn = video['streamdata']['cdnType']
language = general.get('language_raw') or ''
# TODO: reverse more cdns if cdn == 'azure':
formats = self._extract_azure_formats(video, video_id)
cdn = stream_data['cdnType'] elif cdn == 'free':
assert cdn == 'azure' formats = self._extract_free_formats(video, video_id)
else:
azure_locator = stream_data['azureLocator'] # TODO: reverse more cdns
assert False
def get_cdn_shield_base(shield_type='', static=False):
for secure in ('', 's'):
cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
if cdn_shield:
return 'http%s://%s' % (secure, cdn_shield)
else:
if 'fb' in stream_data['azureAccount']:
prefix = 'df' if static else 'f'
else:
prefix = 'd' if static else 'p'
account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', ''))
return 'http://nx-%s%02d.akamaized.net/' % (prefix, account)
azure_stream_base = get_cdn_shield_base()
is_ml = ',' in language
azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % (
azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s'
protection_token = try_get(
video, lambda x: x['protectiondata']['token'], compat_str)
if protection_token:
azure_manifest_url += '?hdnts=%s' % protection_token
formats = self._extract_m3u8_formats(
azure_manifest_url % '(format=m3u8-aapl)',
video_id, 'mp4', 'm3u8_native',
m3u8_id='%s-hls' % cdn, fatal=False)
formats.extend(self._extract_mpd_formats(
azure_manifest_url % '(format=mpd-time-csf)',
video_id, mpd_id='%s-dash' % cdn, fatal=False))
formats.extend(self._extract_ism_formats(
azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False))
azure_progressive_base = get_cdn_shield_base('Prog', True)
azure_file_distribution = stream_data.get('azureFileDistribution')
if azure_file_distribution:
fds = azure_file_distribution.split(',')
if fds:
for fd in fds:
ss = fd.split(':')
if len(ss) == 2:
tbr = int_or_none(ss[0])
if tbr:
f = {
'url': '%s%s/%s_src_%s_%d.mp4' % (
azure_progressive_base, azure_locator, video_id, ss[1], tbr),
'format_id': '%s-http-%d' % (cdn, tbr),
'tbr': tbr,
}
width_height = ss[1].split('x')
if len(width_height) == 2:
f.update({
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
})
formats.append(f)
self._sort_formats(formats) self._sort_formats(formats)

View File

@ -1,18 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
import os
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_str
compat_urlparse,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_str,
)
from ..utils import ( from ..utils import (
unified_strdate,
determine_ext, determine_ext,
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
@ -20,236 +12,77 @@ from ..utils import (
) )
class NHLBaseInfoExtractor(InfoExtractor): class NHLBaseIE(InfoExtractor):
@staticmethod def _real_extract(self, url):
def _fix_json(json_string): site, tmp_id = re.match(self._VALID_URL, url).groups()
return json_string.replace('\\\'', '\'') video_data = self._download_json(
'https://%s/%s/%sid/v1/%s/details/web-v1.json'
% (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id)
if video_data.get('type') != 'video':
video_data = video_data['media']
video = video_data.get('video')
if video:
video_data = video
else:
videos = video_data.get('videos')
if videos:
video_data = videos[0]
def _real_extract_video(self, video_id): video_id = compat_str(video_data['id'])
vid_parts = video_id.split(',') title = video_data['title']
if len(vid_parts) == 3:
video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0'))
json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
data = self._download_json(
json_url, video_id, transform_source=self._fix_json)
return self._extract_video(data[0])
def _extract_video(self, info): formats = []
video_id = info['id'] for playback in video_data.get('playbacks', []):
self.report_extraction(video_id) playback_url = playback.get('url')
if not playback_url:
continue
ext = determine_ext(playback_url)
if ext == 'm3u8':
m3u8_formats = self._extract_m3u8_formats(
playback_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=playback.get('name', 'hls'), fatal=False)
self._check_formats(m3u8_formats, video_id)
formats.extend(m3u8_formats)
else:
height = int_or_none(playback.get('height'))
formats.append({
'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')),
'url': playback_url,
'width': int_or_none(playback.get('width')),
'height': height,
'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)),
})
self._sort_formats(formats)
initial_video_url = info['publishPoint'] thumbnails = []
if info['formats'] == '1': cuts = video_data.get('image', {}).get('cuts') or []
parsed_url = compat_urllib_parse_urlparse(initial_video_url) if isinstance(cuts, dict):
filename, ext = os.path.splitext(parsed_url.path) cuts = cuts.values()
path = '%s_sd%s' % (filename, ext) for thumbnail_data in cuts:
data = compat_urllib_parse_urlencode({ thumbnail_url = thumbnail_data.get('src')
'type': 'fvod', if not thumbnail_url:
'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) continue
thumbnails.append({
'url': thumbnail_url,
'width': int_or_none(thumbnail_data.get('width')),
'height': int_or_none(thumbnail_data.get('height')),
}) })
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
path_doc = self._download_xml(
path_url, video_id, 'Downloading final video url')
video_url = path_doc.find('path').text
else:
video_url = initial_video_url
join = compat_urlparse.urljoin
ret = {
'id': video_id,
'title': info['name'],
'url': video_url,
'description': info['description'],
'duration': int(info['duration']),
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
'upload_date': unified_strdate(info['releaseDate'].split('.')[0]),
}
if video_url.startswith('rtmp:'):
mobj = re.match(r'(?P<tc_url>rtmp://[^/]+/(?P<app>[a-z0-9/]+))/(?P<play_path>mp4:.*)', video_url)
ret.update({
'tc_url': mobj.group('tc_url'),
'play_path': mobj.group('play_path'),
'app': mobj.group('app'),
'no_resume': True,
})
return ret
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:videocenter'
_VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
'md5': 'db704a4ea09e8d3988c85e36cc892d09',
'info_dict': {
'id': '453614',
'ext': 'mp4',
'title': 'Quick clip: Weise 4-3 goal vs Flames',
'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.',
'duration': 18,
'upload_date': '20131006',
},
}, {
'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h',
'md5': 'd22e82bc592f52d37d24b03531ee9696',
'info_dict': {
'id': '2014020024-628-h',
'ext': 'mp4',
'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)',
'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014',
'duration': 0,
'upload_date': '20141011',
},
}, {
'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802',
'md5': 'c78fc64ea01777e426cfc202b746c825',
'info_dict': {
'id': '58665',
'ext': 'flv',
'title': 'Classic Game In Six - April 22, 1979',
'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.',
'duration': 400,
'upload_date': '20100129'
},
}, {
'url': 'http://video.flames.nhl.com/videocenter/console?id=630616',
'only_matching': True,
}, {
'url': 'http://video.nhl.com/videocenter/?id=736722',
'only_matching': True,
}, {
'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en',
'md5': '076fcb88c255154aacbf0a7accc3f340',
'info_dict': {
'id': '2014020299-X-h',
'ext': 'mp4',
'title': 'Penguins at Islanders / Game Highlights',
'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014',
'duration': 268,
'upload_date': '20141122',
}
}, {
'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4',
'info_dict': {
'id': '691469',
'ext': 'mp4',
'title': 'RAW | Craig MacTavish Full Press Conference',
'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.',
'upload_date': '20141205',
},
'params': {
'skip_download': True, # Requires rtmpdump
}
}, {
'url': 'http://video.nhl.com/videocenter/embed?playlist=836127',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._real_extract_video(video_id)
class NHLNewsIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:news'
IE_DESC = 'NHL news'
_VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
_TESTS = [{
'url': 'http://www.nhl.com/ice/news.htm?id=750727',
'md5': '4b3d1262e177687a3009937bd9ec0be8',
'info_dict': {
'id': '736722',
'ext': 'mp4',
'title': 'Cal Clutterbuck has been fined $2,000',
'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6',
'duration': 37,
'upload_date': '20150128',
},
}, {
# iframe embed
'url': 'http://sabres.nhl.com/club/news.htm?id=780189',
'md5': '9f663d1c006c90ac9fb82777d4294e12',
'info_dict': {
'id': '836127',
'ext': 'mp4',
'title': 'Morning Skate: OTT vs. BUF (9/23/15)',
'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.",
'duration': 93,
'upload_date': '20150923',
},
}]
def _real_extract(self, url):
news_id = self._match_id(url)
webpage = self._download_webpage(url, news_id)
video_id = self._search_regex(
[r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'",
r'<iframe[^>]+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'],
webpage, 'video id')
return self._real_extract_video(video_id)
class NHLVideocenterCategoryIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:videocenter:category'
IE_DESC = 'NHL videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$'
_TEST = {
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999',
'info_dict': {
'id': '999',
'title': 'Highlights',
},
'playlist_count': 12,
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
team = mobj.group('team')
webpage = self._download_webpage(url, team)
cat_id = self._search_regex(
[r'var defaultCatId = "(.+?)";',
r'{statusIndex:0,index:0,.*?id:(.*?),'],
webpage, 'category id')
playlist_title = self._html_search_regex(
r'tab0"[^>]*?>(.*?)</td>',
webpage, 'playlist title', flags=re.DOTALL).lower().capitalize()
data = compat_urllib_parse_urlencode({
'cid': cat_id,
# This is the default value
'count': 12,
'ptrs': 3,
'format': 'json',
})
path = '/videocenter/servlets/browse?' + data
request_url = compat_urlparse.urljoin(url, path)
response = self._download_webpage(request_url, playlist_title)
response = self._fix_json(response)
if not response.strip():
self._downloader.report_warning('Got an empty response, trying '
'adding the "newvideos" parameter')
response = self._download_webpage(request_url + '&newvideos=true',
playlist_title)
response = self._fix_json(response)
videos = json.loads(response)
return { return {
'_type': 'playlist', 'id': video_id,
'title': playlist_title, 'title': title,
'id': cat_id, 'description': video_data.get('description'),
'entries': [self._extract_video(v) for v in videos], 'timestamp': parse_iso8601(video_data.get('date')),
'duration': parse_duration(video_data.get('duration')),
'thumbnails': thumbnails,
'formats': formats,
} }
class NHLIE(InfoExtractor): class NHLIE(NHLBaseIE):
IE_NAME = 'nhl.com' IE_NAME = 'nhl.com'
_VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)'
_SITES_MAP = { _CONTENT_DOMAIN = 'nhl.bamcontent.com'
'nhl': 'nhl',
'wch2016': 'wch',
}
_TESTS = [{ _TESTS = [{
# type=video # type=video
'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503',
@ -293,59 +126,3 @@ class NHLIE(InfoExtractor):
'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068',
'only_matching': True, 'only_matching': True,
}] }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
tmp_id, site = mobj.group('id'), mobj.group('site')
video_data = self._download_json(
'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json'
% (self._SITES_MAP[site], tmp_id), tmp_id)
if video_data.get('type') == 'article':
video_data = video_data['media']
video_id = compat_str(video_data['id'])
title = video_data['title']
formats = []
for playback in video_data.get('playbacks', []):
playback_url = playback.get('url')
if not playback_url:
continue
ext = determine_ext(playback_url)
if ext == 'm3u8':
m3u8_formats = self._extract_m3u8_formats(
playback_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=playback.get('name', 'hls'), fatal=False)
self._check_formats(m3u8_formats, video_id)
formats.extend(m3u8_formats)
else:
height = int_or_none(playback.get('height'))
formats.append({
'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')),
'url': playback_url,
'width': int_or_none(playback.get('width')),
'height': height,
})
self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id'))
thumbnails = []
for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items():
thumbnail_url = thumbnail_data.get('src')
if not thumbnail_url:
continue
thumbnails.append({
'id': thumbnail_id,
'url': thumbnail_url,
'width': int_or_none(thumbnail_data.get('width')),
'height': int_or_none(thumbnail_data.get('height')),
})
return {
'id': video_id,
'title': title,
'description': video_data.get('description'),
'timestamp': parse_iso8601(video_data.get('date')),
'duration': parse_duration(video_data.get('duration')),
'thumbnails': thumbnails,
'formats': formats,
}

View File

@ -4,7 +4,6 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
parse_iso8601, parse_iso8601,
float_or_none, float_or_none,

View File

@ -36,8 +36,8 @@ class NPOIE(NPOBaseIE):
https?:// https?://
(?:www\.)? (?:www\.)?
(?: (?:
npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| npo\.nl/(?:[^/]+/)*|
ntr\.nl/(?:[^/]+/){2,}| (?:ntr|npostart)\.nl/(?:[^/]+/){2,}|
omroepwnl\.nl/video/fragment/[^/]+__| omroepwnl\.nl/video/fragment/[^/]+__|
(?:zapp|npo3)\.nl/(?:[^/]+/){2,} (?:zapp|npo3)\.nl/(?:[^/]+/){2,}
) )
@ -160,8 +160,20 @@ class NPOIE(NPOBaseIE):
}, { }, {
'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996',
'only_matching': True,
}, {
'url': 'https://npo.nl/KN_1698996',
'only_matching': True,
}] }]
@classmethod
def suitable(cls, url):
return (False if any(ie.suitable(url)
for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE))
else super(NPOIE, cls).suitable(url))
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
return self._get_info(video_id) return self._get_info(video_id)
@ -389,7 +401,7 @@ class NPOLiveIE(NPOBaseIE):
class NPORadioIE(InfoExtractor): class NPORadioIE(InfoExtractor):
IE_NAME = 'npo.nl:radio' IE_NAME = 'npo.nl:radio'
_VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)/?$' _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)'
_TEST = { _TEST = {
'url': 'http://www.npo.nl/radio/radio-1', 'url': 'http://www.npo.nl/radio/radio-1',
@ -404,6 +416,10 @@ class NPORadioIE(InfoExtractor):
} }
} }
@classmethod
def suitable(cls, url):
return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url)
@staticmethod @staticmethod
def _html_get_attribute_regex(attribute): def _html_get_attribute_regex(attribute):
return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) return r'{0}\s*=\s*\'([^\']+)\''.format(attribute)

View File

@ -16,12 +16,22 @@ from ..utils import (
class NRKBaseIE(InfoExtractor): class NRKBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['NO'] _GEO_COUNTRIES = ['NO']
_api_host = None
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
data = self._download_json( api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
video_id, 'Downloading mediaelement JSON') for api_host in api_hosts:
data = self._download_json(
'http://%s/mediaelement/%s' % (api_host, video_id),
video_id, 'Downloading mediaelement JSON',
fatal=api_host == api_hosts[-1])
if not data:
continue
self._api_host = api_host
break
title = data.get('fullTitle') or data.get('mainTitle') or data['title'] title = data.get('fullTitle') or data.get('mainTitle') or data['title']
video_id = data.get('id') or video_id video_id = data.get('id') or video_id
@ -191,7 +201,7 @@ class NRKIE(NRKBaseIE):
) )
(?P<id>[^?#&]+) (?P<id>[^?#&]+)
''' '''
_API_HOST = 'v8-psapi.nrk.no' _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no')
_TESTS = [{ _TESTS = [{
# video # video
'url': 'http://www.nrk.no/video/PS*150533', 'url': 'http://www.nrk.no/video/PS*150533',
@ -237,8 +247,7 @@ class NRKTVIE(NRKBaseIE):
(?:/\d{2}-\d{2}-\d{4})? (?:/\d{2}-\d{2}-\d{4})?
(?:\#del=(?P<part_id>\d+))? (?:\#del=(?P<part_id>\d+))?
''' % _EPISODE_RE ''' % _EPISODE_RE
_API_HOST = 'psapi-we.nrk.no' _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
_TESTS = [{ _TESTS = [{
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
'md5': '4e9ca6629f09e588ed240fb11619922a', 'md5': '4e9ca6629f09e588ed240fb11619922a',

View File

@ -243,7 +243,7 @@ class PhantomJSwrapper(object):
class OpenloadIE(InfoExtractor): class OpenloadIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://openload.co/f/kUEfGclsU9o', 'url': 'https://openload.co/f/kUEfGclsU9o',
@ -301,6 +301,16 @@ class OpenloadIE(InfoExtractor):
}, { }, {
'url': 'https://oload.xyz/f/WwRBpzW8Wtk', 'url': 'https://oload.xyz/f/WwRBpzW8Wtk',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://oload.win/f/kUEfGclsU9o',
'only_matching': True,
}, {
'url': 'https://oload.download/f/kUEfGclsU9o',
'only_matching': True,
}, {
# Its title has not got its extension but url has it
'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4',
'only_matching': True,
}] }]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
@ -362,8 +372,7 @@ class OpenloadIE(InfoExtractor):
'title': title, 'title': title,
'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None),
'url': video_url, 'url': video_url,
# Seems all videos have extensions in their titles 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'),
'ext': determine_ext(title, 'mp4'),
'subtitles': subtitles, 'subtitles': subtitles,
'http_headers': headers, 'http_headers': headers,
} }

View File

@ -360,6 +360,21 @@ class PBSIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
{
'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
'info_dict': {
'id': '2365936247',
'ext': 'mp4',
'title': 'Antiques Roadshow - Indianapolis, Hour 2',
'description': 'md5:524b32249db55663e7231b6b8d1671a2',
'duration': 3180,
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True, 'only_matching': True,
@ -422,6 +437,7 @@ class PBSIE(InfoExtractor):
r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
] ]
media_id = self._search_regex( media_id = self._search_regex(

View File

@ -53,7 +53,8 @@ class RBMARadioIE(InfoExtractor):
'format_id': compat_str(abr), 'format_id': compat_str(abr),
'abr': abr, 'abr': abr,
'vcodec': 'none', 'vcodec': 'none',
} for abr in (96, 128, 256)] } for abr in (96, 128, 192, 256)]
self._check_formats(formats, episode_id)
description = clean_html(episode.get('longTeaser')) description = clean_html(episode.get('longTeaser'))
thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape'))

View File

@ -74,7 +74,14 @@ class SafariBaseIE(InfoExtractor):
class SafariIE(SafariBaseIE): class SafariIE(SafariBaseIE):
IE_NAME = 'safari' IE_NAME = 'safari'
IE_DESC = 'safaribooksonline.com online video' IE_DESC = 'safaribooksonline.com online video'
_VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html' _VALID_URL = r'''(?x)
https?://
(?:www\.)?safaribooksonline\.com/
(?:
library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
)
'''
_TESTS = [{ _TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
@ -94,22 +101,41 @@ class SafariIE(SafariBaseIE):
}, { }, {
'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
'only_matching': True,
}] }]
_PARTNER_ID = '1926081'
_UICONF_ID = '29375172'
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part'))
webpage = self._download_webpage(url, video_id) reference_id = mobj.group('reference_id')
reference_id = self._search_regex( if reference_id:
r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1', video_id = reference_id
webpage, 'kaltura reference id', group='id') partner_id = self._PARTNER_ID
partner_id = self._search_regex( ui_id = self._UICONF_ID
r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1', else:
webpage, 'kaltura widget id', group='id') video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))
ui_id = self._search_regex(
r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, urlh = self._download_webpage_handle(url, video_id)
webpage, 'kaltura uiconf id', group='id')
mobj = re.match(self._VALID_URL, urlh.geturl())
reference_id = mobj.group('reference_id')
if not reference_id:
reference_id = self._search_regex(
r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'kaltura reference id', group='id')
partner_id = self._search_regex(
r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'kaltura widget id', default=self._PARTNER_ID,
group='id')
ui_id = self._search_regex(
r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'kaltura uiconf id', default=self._UICONF_ID,
group='id')
query = { query = {
'wid': '_%s' % partner_id, 'wid': '_%s' % partner_id,
@ -159,10 +185,15 @@ class SafariCourseIE(SafariBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)| (?:www\.)?safaribooksonline\.com/
(?:
library/view/[^/]+|
api/v1/book|
videos/[^/]+
)|
techbus\.safaribooksonline\.com techbus\.safaribooksonline\.com
) )
/(?P<id>[^/]+)/?(?:[#?]|$) /(?P<id>[^/]+)
''' '''
_TESTS = [{ _TESTS = [{
@ -179,8 +210,16 @@ class SafariCourseIE(SafariBaseIE):
}, { }, {
'url': 'http://techbus.safaribooksonline.com/9780134426365', 'url': 'http://techbus.safaribooksonline.com/9780134426365',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
'only_matching': True,
}] }]
@classmethod
def suitable(cls, url):
return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
else super(SafariCourseIE, cls).suitable(url))
def _real_extract(self, url): def _real_extract(self, url):
course_id = self._match_id(url) course_id = self._match_id(url)

View File

@ -4,6 +4,10 @@ from __future__ import unicode_literals
import re import re
from .turner import TurnerBaseIE from .turner import TurnerBaseIE
from ..compat import (
compat_urllib_parse_urlparse,
compat_parse_qs,
)
from ..utils import ( from ..utils import (
float_or_none, float_or_none,
int_or_none, int_or_none,
@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
site, display_id = re.match(self._VALID_URL, url).groups() site, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
video_data = self._parse_json(self._search_regex( drupal_settings = self._parse_json(self._search_regex(
r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>', r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
webpage, 'drupal setting'), display_id)['turner_playlist'][0] webpage, 'drupal setting'), display_id)
video_data = drupal_settings['turner_playlist'][0]
media_id = video_data['mediaID'] media_id = video_data['mediaID']
title = video_data['title'] title = video_data['title']
tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse(
drupal_settings['ngtv_token_url']).query)
streams_data = self._download_json( info = self._extract_ngtv_info(
'http://medium.ngtv.io/media/%s/tv' % media_id, media_id, tokenizer_query, {
media_id)['media']['tv'] 'url': url,
duration = None 'site_name': site[:3].upper(),
chapters = [] 'auth_required': video_data.get('authRequired') == '1',
formats = [] })
for supported_type in ('unprotected', 'bulkaes'):
stream_data = streams_data.get(supported_type, {})
m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
if not m3u8_url:
continue
if stream_data.get('playlistProtection') == 'spe':
m3u8_url = self._add_akamai_spe_token(
'http://token.vgtf.net/token/token_spe',
m3u8_url, media_id, {
'url': url,
'site_name': site[:3].upper(),
'auth_required': video_data.get('authRequired') == '1',
})
formats.extend(self._extract_m3u8_formats(
m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration'))
if not chapters:
for chapter in stream_data.get('contentSegments', []):
start_time = float_or_none(chapter.get('start'))
duration = float_or_none(chapter.get('duration'))
if start_time is None or duration is None:
continue
chapters.append({
'start_time': start_time,
'end_time': start_time + duration,
})
self._sort_formats(formats)
thumbnails = [] thumbnails = []
for image_id, image in video_data.get('images', {}).items(): for image_id, image in video_data.get('images', {}).items():
@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE):
}) })
thumbnails.append(i) thumbnails.append(i)
return { info.update({
'id': media_id, 'id': media_id,
'title': title, 'title': title,
'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')),
'duration': duration, 'duration': float_or_none(video_data.get('duration')) or info.get('duration'),
'timestamp': int_or_none(video_data.get('created')), 'timestamp': int_or_none(video_data.get('created')),
'season_number': int_or_none(video_data.get('season')), 'season_number': int_or_none(video_data.get('season')),
'episode_number': int_or_none(video_data.get('episode')), 'episode_number': int_or_none(video_data.get('episode')),
'cahpters': chapters,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'formats': formats, })
} return info

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import json import json
from .common import InfoExtractor from .turner import TurnerBaseIE
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
@ -15,7 +15,7 @@ from ..utils import (
) )
class TeamcocoIE(InfoExtractor): class TeamcocoIE(TurnerBaseIE):
_VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
_TESTS = [ _TESTS = [
{ {
@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor):
name name
} }
duration duration
turnerMediaId
turnerMediaAuthToken
} }
} }
... on NotFoundSlug { ... on NotFoundSlug {
@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor):
record = response['record'] record = response['record']
video_id = record['id'] video_id = record['id']
video_sources = self._graphql_call('''{ info = {
%s(id: "%s") {
src
}
}''', 'RecordVideoSource', video_id) or {}
formats = []
get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
for format_id, src in video_sources.get('src', {}).items():
if not isinstance(src, dict):
continue
src_url = src.get('src')
if not src_url:
continue
ext = determine_ext(src_url, mimetype2ext(src.get('type')))
if format_id == 'hls' or ext == 'm3u8':
# compat_urllib_parse.urljoin does not work here
if src_url.startswith('/'):
src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
formats.extend(self._extract_m3u8_formats(
src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
else:
if src_url.startswith('/mp4:protected/'):
# TODO Correct extraction for these files
continue
tbr = int_or_none(self._search_regex(
r'(\d+)k\.mp4', src_url, 'tbr', default=None))
formats.append({
'url': src_url,
'ext': ext,
'tbr': tbr,
'format_id': format_id,
'quality': get_quality(format_id),
})
if not formats:
formats = self._extract_m3u8_formats(
record['file']['url'], video_id, 'mp4', fatal=False)
self._sort_formats(formats)
return {
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'formats': formats,
'title': record['title'], 'title': record['title'],
'thumbnail': record.get('thumb', {}).get('preview'), 'thumbnail': record.get('thumb', {}).get('preview'),
'description': record.get('teaser'), 'description': record.get('teaser'),
'duration': parse_duration(record.get('duration')), 'duration': parse_duration(record.get('duration')),
'timestamp': parse_iso8601(record.get('publishOn')), 'timestamp': parse_iso8601(record.get('publishOn')),
} }
media_id = record.get('turnerMediaId')
if media_id:
self._initialize_geo_bypass({
'countries': ['US'],
})
info.update(self._extract_ngtv_info(media_id, {
'accessToken': record['turnerMediaAuthToken'],
'accessTokenType': 'jws',
}))
else:
video_sources = self._graphql_call('''{
%s(id: "%s") {
src
}
}''', 'RecordVideoSource', video_id) or {}
formats = []
get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
for format_id, src in video_sources.get('src', {}).items():
if not isinstance(src, dict):
continue
src_url = src.get('src')
if not src_url:
continue
ext = determine_ext(src_url, mimetype2ext(src.get('type')))
if format_id == 'hls' or ext == 'm3u8':
# compat_urllib_parse.urljoin does not work here
if src_url.startswith('/'):
src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
formats.extend(self._extract_m3u8_formats(
src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
else:
if src_url.startswith('/mp4:protected/'):
# TODO Correct extraction for these files
continue
tbr = int_or_none(self._search_regex(
r'(\d+)k\.mp4', src_url, 'tbr', default=None))
formats.append({
'url': src_url,
'ext': ext,
'tbr': tbr,
'format_id': format_id,
'quality': get_quality(format_id),
})
if not formats:
formats = self._extract_m3u8_formats(
record['file']['url'], video_id, 'mp4', fatal=False)
self._sort_formats(formats)
info['formats'] = formats
return info

View File

@ -9,6 +9,7 @@ from ..utils import (
xpath_text, xpath_text,
int_or_none, int_or_none,
determine_ext, determine_ext,
float_or_none,
parse_duration, parse_duration,
xpath_attr, xpath_attr,
update_url_query, update_url_query,
@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE):
def _extract_timestamp(self, video_data): def _extract_timestamp(self, video_data):
return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts'))
def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None):
secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'
token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path)
if not token: if not token:
query = { query = {
'path': secure_path, 'path': secure_path,
'videoId': content_id,
} }
if custom_tokenizer_query:
query.update(custom_tokenizer_query)
else:
query['videoId'] = content_id
if ap_data.get('auth_required'): if ap_data.get('auth_required'):
query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'])
auth = self._download_xml( auth = self._download_xml(
@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE):
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
'is_live': is_live, 'is_live': is_live,
} }
def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None):
streams_data = self._download_json(
'http://medium.ngtv.io/media/%s/tv' % media_id,
media_id)['media']['tv']
duration = None
chapters = []
formats = []
for supported_type in ('unprotected', 'bulkaes'):
stream_data = streams_data.get(supported_type, {})
m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
if not m3u8_url:
continue
if stream_data.get('playlistProtection') == 'spe':
m3u8_url = self._add_akamai_spe_token(
'http://token.ngtv.io/token/token_spe',
m3u8_url, media_id, ap_data or {}, tokenizer_query)
formats.extend(self._extract_m3u8_formats(
m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
duration = float_or_none(stream_data.get('totalRuntime'))
if not chapters:
for chapter in stream_data.get('contentSegments', []):
start_time = float_or_none(chapter.get('start'))
chapter_duration = float_or_none(chapter.get('duration'))
if start_time is None or chapter_duration is None:
continue
chapters.append({
'start_time': start_time,
'end_time': start_time + chapter_duration,
})
self._sort_formats(formats)
return {
'formats': formats,
'chapters': chapters,
'duration': duration,
}

View File

@ -1,13 +1,12 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
try_get,
determine_ext,
) )
@ -78,42 +77,25 @@ class TV4IE(InfoExtractor):
title = info['title'] title = info['title']
subtitles = {} manifest_url = self._download_json(
formats = [] 'https://playback-api.b17g.net/media/' + video_id,
# http formats are linked with unresolvable host video_id, query={
for kind in ('hls3', ''): 'service': 'tv4',
data = self._download_json( 'device': 'browser',
'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, 'protocol': 'hls',
video_id, 'Downloading sources JSON', query={ })['playbackItem']['manifestUrl']
'protocol': kind, formats = self._extract_m3u8_formats(
'videoFormat': 'MP4+WEBVTT', manifest_url, video_id, 'mp4',
}) 'm3u8_native', m3u8_id='hls', fatal=False)
items = try_get(data, lambda x: x['playback']['items']['item']) formats.extend(self._extract_mpd_formats(
if not items: manifest_url.replace('.m3u8', '.mpd'),
continue video_id, mpd_id='dash', fatal=False))
if isinstance(items, dict): formats.extend(self._extract_f4m_formats(
items = [items] manifest_url.replace('.m3u8', '.f4m'),
for item in items: video_id, f4m_id='hds', fatal=False))
manifest_url = item.get('url') formats.extend(self._extract_ism_formats(
if not isinstance(manifest_url, compat_str): re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
continue video_id, ism_id='mss', fatal=False))
ext = determine_ext(manifest_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=kind, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_akamai_formats(
manifest_url, video_id, {
'hls': 'tv4play-i.akamaihd.net',
}))
elif ext == 'webvtt':
subtitles = self._merge_subtitles(
subtitles, {
'sv': [{
'url': manifest_url,
'ext': 'vtt',
}]})
if not formats and info.get('is_geo_restricted'): if not formats and info.get('is_geo_restricted'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
@ -124,7 +106,7 @@ class TV4IE(InfoExtractor):
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'formats': formats, 'formats': formats,
'subtitles': subtitles, # 'subtitles': subtitles,
'description': info.get('description'), 'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')), 'duration': int_or_none(info.get('duration')),

View File

@ -0,0 +1,148 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
unescapeHTML,
)
class TVNetIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)'
_TESTS = [{
# video
'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
'md5': 'b4d7abe0252c9b47774760b7519c7558',
'info_dict': {
'id': '109788',
'ext': 'mp4',
'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
'is_live': False,
'view_count': int,
},
}, {
# audio
'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
'info_dict': {
'id': '27017',
'ext': 'm4a',
'title': 'VOV1 - Bản tin chiều (10/06/2018)',
'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
'is_live': False,
},
}, {
'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705',
'info_dict': {
'id': '129999',
'ext': 'mp4',
'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)',
'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
'is_live': False,
},
'params': {
'skip_download': True,
},
}, {
# live stream
'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
'info_dict': {
'id': '1011',
'ext': 'mp4',
'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
'is_live': True,
},
'params': {
'skip_download': True,
},
}, {
# radio live stream
'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
'info_dict': {
'id': '1014',
'ext': 'm4a',
'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
'is_live': True,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(
webpage, default=None) or self._html_search_meta(
'title', webpage, default=None) or self._search_regex(
r'<title>([^<]+)<', webpage, 'title')
title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
if '/video/' in url or '/radio/' in url:
is_live = False
elif '/kenh-truyen-hinh/' in url:
is_live = True
else:
is_live = None
data_file = unescapeHTML(self._search_regex(
r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
'data file', group='url'))
stream_urls = set()
formats = []
for stream in self._download_json(data_file, video_id):
if not isinstance(stream, dict):
continue
stream_url = stream.get('url')
if (stream_url in stream_urls or not stream_url or
not isinstance(stream_url, compat_str)):
continue
stream_urls.add(stream_url)
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, 'mp4',
entry_protocol='m3u8' if is_live else 'm3u8_native',
m3u8_id='hls', fatal=False))
self._sort_formats(formats)
# better support for radio streams
if title.startswith('VOV'):
for f in formats:
f.update({
'ext': 'm4a',
'vcodec': 'none',
})
thumbnail = self._og_search_thumbnail(
webpage, default=None) or unescapeHTML(
self._search_regex(
r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
'thumbnail', default=None, group='url'))
if is_live:
title = self._live_title(title)
view_count = int_or_none(self._search_regex(
r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
webpage, 'view count', default=None))
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'is_live': is_live,
'view_count': view_count,
'formats': formats,
}

View File

@ -63,7 +63,7 @@ class TwitterCardIE(TwitterBaseIE):
'id': '623160978427936768', 'id': '623160978427936768',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Twitter web player', 'title': 'Twitter web player',
'thumbnail': r're:^https?://.*(?:\bformat=|\.)jpg', 'thumbnail': r're:^https?://.*$',
}, },
}, },
{ {
@ -108,6 +108,8 @@ class TwitterCardIE(TwitterBaseIE):
}, },
] ]
_API_BASE = 'https://api.twitter.com/1.1'
def _parse_media_info(self, media_info, video_id): def _parse_media_info(self, media_info, video_id):
formats = [] formats = []
for media_variant in media_info.get('variants', []): for media_variant in media_info.get('variants', []):
@ -149,7 +151,7 @@ class TwitterCardIE(TwitterBaseIE):
main_script, 'bearer token') main_script, 'bearer token')
# https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id
api_data = self._download_json( api_data = self._download_json(
'https://api.twitter.com/1.1/statuses/show/%s.json' % video_id, '%s/statuses/show/%s.json' % (self._API_BASE, video_id),
video_id, 'Downloading API data', video_id, 'Downloading API data',
headers={ headers={
'Authorization': 'Bearer ' + bearer_token, 'Authorization': 'Bearer ' + bearer_token,
@ -223,15 +225,49 @@ class TwitterCardIE(TwitterBaseIE):
formats.extend(self._extract_mobile_formats(username, video_id)) formats.extend(self._extract_mobile_formats(username, video_id))
if formats: if formats:
title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
thumbnail = config.get('posterImageUrl') or config.get('image_src')
duration = float_or_none(config.get('duration'), scale=1000) or duration
break break
if not formats:
headers = {
'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
'Referer': url,
}
ct0 = self._get_cookies(url).get('ct0')
if ct0:
headers['csrf_token'] = ct0.value
guest_token = self._download_json(
'%s/guest/activate.json' % self._API_BASE, video_id,
'Downloading guest token', data=b'',
headers=headers)['guest_token']
headers['x-guest-token'] = guest_token
self._set_cookie('api.twitter.com', 'gt', guest_token)
config = self._download_json(
'%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id),
video_id, headers=headers)
track = config['track']
vmap_url = track.get('vmapUrl')
if vmap_url:
formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
else:
playback_url = track['playbackUrl']
if determine_ext(playback_url) == 'm3u8':
formats = self._extract_m3u8_formats(
playback_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
else:
formats = [{
'url': playback_url,
}]
title = 'Twitter web player'
thumbnail = config.get('posterImage')
duration = float_or_none(track.get('durationMs'), scale=1000)
self._remove_duplicate_formats(formats) self._remove_duplicate_formats(formats)
self._sort_formats(formats) self._sort_formats(formats)
title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
thumbnail = config.get('posterImageUrl') or config.get('image_src')
duration = float_or_none(config.get('duration'), scale=1000) or duration
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
@ -375,6 +411,22 @@ class TwitterIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
}, },
}, {
# card via api.twitter.com/1.1/videos/tweet/config
'url': 'https://twitter.com/LisPower1/status/1001551623938805763',
'info_dict': {
'id': '1001551623938805763',
'ext': 'mp4',
'title': 're:.*?Shep is on a roll today.*?',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:63b036c228772523ae1924d5f8e5ed6b',
'uploader': 'Lis Power',
'uploader_id': 'LisPower1',
'duration': 111.278,
},
'params': {
'skip_download': True, # requires ffmpeg
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -36,7 +36,8 @@ class WimpIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
youtube_id = self._search_regex( youtube_id = self._search_regex(
r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", (r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']",
r'data-id=["\']([0-9A-Za-z_-]{11})'),
webpage, 'video URL', default=None) webpage, 'video URL', default=None)
if youtube_id: if youtube_id:
return { return {

View File

@ -510,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'creator': 'Icona Pop', 'creator': 'Icona Pop',
'track': 'I Love It (feat. Charli XCX)',
'artist': 'Icona Pop',
} }
}, },
{ {
@ -528,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'creator': 'Justin Timberlake', 'creator': 'Justin Timberlake',
'track': 'Tunnel Vision',
'artist': 'Justin Timberlake',
'age_limit': 18, 'age_limit': 18,
} }
}, },
@ -597,7 +601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'IB3lcPjvWLA', 'id': 'IB3lcPjvWLA',
'ext': 'm4a', 'ext': 'm4a',
'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
'duration': 244, 'duration': 244,
'uploader': 'AfrojackVEVO', 'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO',
@ -638,7 +642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'duration': 219, 'duration': 219,
'upload_date': '20100909', 'upload_date': '20100909',
'uploader': 'The Amazing Atheist', 'uploader': 'TJ Kirk',
'uploader_id': 'TheAmazingAtheist', 'uploader_id': 'TheAmazingAtheist',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
@ -668,10 +672,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
'info_dict': { 'info_dict': {
'id': '6kLq3WMV1nU', 'id': '6kLq3WMV1nU',
'ext': 'mp4', 'ext': 'webm',
'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
'description': 'md5:33765bb339e1b47e7e72b5490139bb41', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'duration': 247, 'duration': 246,
'uploader': 'LloydVEVO', 'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO', 'uploader_id': 'LloydVEVO',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
@ -733,7 +737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'AllenMeow', 'uploader_id': 'AllenMeow',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '艾倫', 'uploader': 'ᄋᄅ',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
}, },
@ -760,7 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
'info_dict': { 'info_dict': {
'id': 'FIl7x6_3R5Y', 'id': 'FIl7x6_3R5Y',
'ext': 'mp4', 'ext': 'webm',
'title': 'md5:7b81415841e02ecd4313668cde88737a', 'title': 'md5:7b81415841e02ecd4313668cde88737a',
'description': 'md5:116377fd2963b81ec4ce64b542173306', 'description': 'md5:116377fd2963b81ec4ce64b542173306',
'duration': 220, 'duration': 220,
@ -769,8 +773,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000', 'uploader': 'dorappi2000',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'formats': 'mincount:32', 'formats': 'mincount:31',
}, },
'skip': 'not actual anymore',
}, },
# DASH manifest with segment_list # DASH manifest with segment_list
{ {
@ -885,7 +890,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'lsguqyKfVQg', 'id': 'lsguqyKfVQg',
'ext': 'mp4', 'ext': 'mp4',
'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
'alt_title': 'Dark Walk', 'alt_title': 'Dark Walk - Position Music',
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'duration': 133, 'duration': 133,
'upload_date': '20151119', 'upload_date': '20151119',
@ -893,7 +898,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf', 'uploader': 'IronSoulElf',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
'track': 'Dark Walk - Position Music',
'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -950,7 +957,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:dda0d780d5a6e120758d1711d062a867', 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
'duration': 4060, 'duration': 4060,
'upload_date': '20151119', 'upload_date': '20151119',
'uploader': 'Bernie 2016', 'uploader': 'Bernie Sanders',
'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
'license': 'Creative Commons Attribution license (reuse allowed)', 'license': 'Creative Commons Attribution license (reuse allowed)',
@ -985,6 +992,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'skip': 'This video is not available.',
}, },
{ {
# YouTube Red video with episode data # YouTube Red video with episode data
@ -993,7 +1001,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'iqKdEhx-dD4', 'id': 'iqKdEhx-dD4',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Isolation - Mind Field (Ep 1)', 'title': 'Isolation - Mind Field (Ep 1)',
'description': 'md5:8013b7ddea787342608f63a13ddc9492', 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
'duration': 2085, 'duration': 2085,
'upload_date': '20170118', 'upload_date': '20170118',
'uploader': 'Vsauce', 'uploader': 'Vsauce',
@ -1026,7 +1034,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
'license': 'Standard YouTube License', 'license': 'Standard YouTube License',
'view_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -1694,128 +1701,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
# Start extracting information
self.report_information_extraction(video_id)
# uploader
video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
if video_uploader:
video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
else:
self._downloader.report_warning('unable to extract uploader name')
# uploader_id
video_uploader_id = None
video_uploader_url = None
mobj = re.search(
r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
video_webpage)
if mobj is not None:
video_uploader_id = mobj.group('uploader_id')
video_uploader_url = mobj.group('uploader_url')
else:
self._downloader.report_warning('unable to extract uploader nickname')
# thumbnail image
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
video_webpage, re.DOTALL)
if m_thumb is not None:
video_thumbnail = m_thumb.group(1)
elif 'thumbnail_url' not in video_info:
self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
upload_date = self._html_search_meta(
'datePublished', video_webpage, 'upload date', default=None)
if not upload_date:
upload_date = self._search_regex(
[r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None)
upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex(
r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
video_webpage, 'license', default=None)
m_music = re.search(
r'''(?x)
<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
<ul[^>]*>\s*
<li>(?P<title>.+?)
by (?P<creator>.+?)
(?:
\(.+?\)|
<a[^>]*
(?:
\bhref=["\']/red[^>]*>| # drop possible
>\s*Listen ad-free with YouTube Red # YouTube Red ad
)
.*?
)?</li
''',
video_webpage)
if m_music:
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
video_creator = clean_html(m_music.group('creator'))
else:
video_alt_title = video_creator = None
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
if m_episode:
series = m_episode.group('series')
season_number = int(m_episode.group('season'))
episode_number = int(m_episode.group('episode'))
else:
series = season_number = episode_number = None
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
video_webpage, 'categories', default=None)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
default=None)
video_categories = None if category is None else [category]
else:
video_categories = None
video_tags = [
unescapeHTML(m.group('content'))
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
def _extract_count(count_name):
return str_to_int(self._search_regex(
r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
% re.escape(count_name),
video_webpage, count_name, default=None))
like_count = _extract_count('like')
dislike_count = _extract_count('dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
video_duration = try_get(
video_info, lambda x: int_or_none(x['length_seconds'][0]))
if not video_duration:
video_duration = parse_duration(self._html_search_meta(
'duration', video_webpage, 'video duration'))
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
chapters = self._extract_chapters(description_original, video_duration)
def _extract_filesize(media_url): def _extract_filesize(media_url):
return int_or_none(self._search_regex( return int_or_none(self._search_regex(
r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
@ -1990,6 +1875,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(error_message, expected=True) raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# uploader
video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
if video_uploader:
video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
else:
self._downloader.report_warning('unable to extract uploader name')
# uploader_id
video_uploader_id = None
video_uploader_url = None
mobj = re.search(
r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
video_webpage)
if mobj is not None:
video_uploader_id = mobj.group('uploader_id')
video_uploader_url = mobj.group('uploader_url')
else:
self._downloader.report_warning('unable to extract uploader nickname')
# thumbnail image
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
video_webpage, re.DOTALL)
if m_thumb is not None:
video_thumbnail = m_thumb.group(1)
elif 'thumbnail_url' not in video_info:
self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
upload_date = self._html_search_meta(
'datePublished', video_webpage, 'upload date', default=None)
if not upload_date:
upload_date = self._search_regex(
[r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None)
upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex(
r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
video_webpage, 'license', default=None)
m_music = re.search(
r'''(?x)
<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
<ul[^>]*>\s*
<li>(?P<title>.+?)
by (?P<creator>.+?)
(?:
\(.+?\)|
<a[^>]*
(?:
\bhref=["\']/red[^>]*>| # drop possible
>\s*Listen ad-free with YouTube Red # YouTube Red ad
)
.*?
)?</li
''',
video_webpage)
if m_music:
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
video_creator = clean_html(m_music.group('creator'))
else:
video_alt_title = video_creator = None
def extract_meta(field):
return self._html_search_regex(
r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
video_webpage, field, default=None)
track = extract_meta('Song')
artist = extract_meta('Artist')
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
if m_episode:
series = m_episode.group('series')
season_number = int(m_episode.group('season'))
episode_number = int(m_episode.group('episode'))
else:
series = season_number = episode_number = None
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
video_webpage, 'categories', default=None)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
default=None)
video_categories = None if category is None else [category]
else:
video_categories = None
video_tags = [
unescapeHTML(m.group('content'))
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
def _extract_count(count_name):
return str_to_int(self._search_regex(
r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
% re.escape(count_name),
video_webpage, count_name, default=None))
like_count = _extract_count('like')
dislike_count = _extract_count('dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
video_duration = try_get(
video_info, lambda x: int_or_none(x['length_seconds'][0]))
if not video_duration:
video_duration = parse_duration(self._html_search_meta(
'duration', video_webpage, 'video duration'))
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
chapters = self._extract_chapters(description_original, video_duration)
# Look for the DASH manifest # Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True): if self._downloader.params.get('youtube_include_dash_manifest', True):
dash_mpd_fatal = True dash_mpd_fatal = True
@ -2055,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': video_uploader_url, 'uploader_url': video_uploader_url,
'upload_date': upload_date, 'upload_date': upload_date,
'license': video_license, 'license': video_license,
'creator': video_creator, 'creator': video_creator or artist,
'title': video_title, 'title': video_title,
'alt_title': video_alt_title, 'alt_title': video_alt_title or track,
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': video_description, 'description': video_description,
'categories': video_categories, 'categories': video_categories,
@ -2080,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'series': series, 'series': series,
'season_number': season_number, 'season_number': season_number,
'episode_number': episode_number, 'episode_number': episode_number,
'track': track,
'artist': artist,
} }

View File

@ -1228,7 +1228,7 @@ def unified_timestamp(date_str, day_first=True):
def determine_ext(url, default_ext='unknown_video'): def determine_ext(url, default_ext='unknown_video'):
if url is None: if url is None or '.' not in url:
return default_ext return default_ext
guess = url.partition('?')[0].rpartition('.')[2] guess = url.partition('?')[0].rpartition('.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess): if re.match(r'^[A-Za-z0-9]+$', guess):

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals from __future__ import unicode_literals
__version__ = '2018.05.26' __version__ = '2018.06.14'