diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index fcfadeb1f..80ca6d5f1 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2019.07.12** +- [ ] I've verified that I'm running youtube-dl version **2019.07.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.07.12 + [debug] youtube-dl version 2019.07.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7e1a3d1c0..a4f3c4dd9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2019.07.12** +- [ ] I've verified that I'm running youtube-dl version **2019.07.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 71782a104..9d82e1cd9 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2019.07.12** +- [ ] I've verified that I'm running youtube-dl version **2019.07.14** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 6bcfde1f8..ff82a7435 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2019.07.12** +- [ ] I've verified that I'm running youtube-dl version **2019.07.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2019.07.12 + [debug] youtube-dl version 2019.07.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 89d9c63aa..f692c663d 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2019.07.12** +- [ ] I've verified that I'm running youtube-dl version **2019.07.14** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 45cb2746e..be1606586 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version 2019.07.14 + +Extractors +* [porn91] Fix extraction (#21312) ++ [yandexmusic] Extract track number and disk number (#21421) ++ [yandexmusic] Add support for multi disk albums (#21420, #21421) +* [lynda] Handle missing subtitles (#20490, #20513) ++ [youtube] Add more invidious instances to URL regular expression (#21694) +* [twitter] Improve uploader id extraction (#21705) +* [spankbang] Fix and improve metadata extraction +* [spankbang] Fix extraction (#21763, #21764) ++ [dlive] Add support for dlive.tv (#18080) ++ [livejournal] Add support for livejournal.com (#21526) +* [roosterteeth] Fix free episode extraction (#16094) +* [dbtv] Fix extraction +* [bellator] Fix extraction +- [rudo] Remove extractor (#18430, #18474) +* [facebook] Fallback to twitter:image meta for thumbnail extraction (#21224) +* [bleacherreport] Fix Bleacher Report CMS extraction +* [espn] Fix fivethirtyeight.com extraction +* [5tv] Relax video URL regular expression and support https URLs +* [youtube] Fix is_live extraction (#21734) +* [youtube] Fix authentication (#11270) + + version 2019.07.12 Core diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 4e664336d..9ae6e5c96 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -223,6 +223,8 @@ - **DiscoveryNetworksDe** - **DiscoveryVR** - **Disney** + - **dlive:stream** + - **dlive:vod** - **Dotsub** - **DouyuShow** - **DouyuTV**: 斗鱼 @@ -448,6 +450,7 @@ - **linkedin:learning:course** - **LinuxAcademy** - **LiTV** + - **LiveJournal** - **LiveLeak** - **LiveLeakEmbed** - **livestream** @@ -754,7 +757,6 @@ - **rtve.es:television** - **RTVNH** - **RTVS** - - **Rudo** - **RUHD** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py index b92031c8a..dc86c57c5 100644 --- a/youtube_dl/extractor/biobiochiletv.py +++ b/youtube_dl/extractor/biobiochiletv.py @@ -6,7 +6,6 @@ from ..utils import ( ExtractorError, remove_end, ) -from .rudo import RudoIE class BioBioChileTVIE(InfoExtractor): @@ -41,11 +40,15 @@ class BioBioChileTVIE(InfoExtractor): }, { 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', 'info_dict': { - 'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', + 'id': 'b4xd0LK3SK', 'ext': 'mp4', - 'uploader': '(none)', - 'upload_date': '20160708', - 'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', + # TODO: fix url_transparent information overriding + # 'uploader': 'Juan Pablo Echenique', + 'title': 'Comentario Oscar Cáceres', + }, + 'params': { + # empty m3u8 manifest + 'skip_download': True, }, }, { 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', @@ -60,7 +63,9 @@ class BioBioChileTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - rudo_url = RudoIE._extract_url(webpage) + rudo_url = self._search_regex( + r']+src=(?P[\'"])(?P(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage, 'embed URL', None, group='url') if not rudo_url: raise ExtractorError('No videos found') @@ -68,7 +73,7 @@ class BioBioChileTVIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) uploader = self._html_search_regex( - r']+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)', + r']+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)', webpage, 'uploader', fatal=False) return { diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index e829974ff..dc60224d0 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -71,7 +71,7 @@ class BleacherReportIE(InfoExtractor): video = article_data.get('video') if video: video_type = video['type'] - if video_type == 'cms.bleacherreport.com': + if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] elif video_type == 'ooyala.com': info['url'] = 'ooyala:%s' % video['id'] @@ -87,9 +87,9 @@ class BleacherReportIE(InfoExtractor): class BleacherReportCMSIE(AMPIE): - _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36})' + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36}|\d{5})' _TESTS = [{ - 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', @@ -101,6 +101,6 @@ class BleacherReportCMSIE(AMPIE): def _real_extract(self, url): video_id = self._match_id(url) - info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id) info['id'] = video_id return info diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index f232f0dc5..aaedf2e3d 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -7,50 +7,51 @@ from .common import InfoExtractor class DBTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P[0-9]+)(?:#(?P.+))?' + _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P[^/]+))/)?(?P[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' _TESTS = [{ - 'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', - 'md5': '2e24f67936517b143a234b4cadf792ec', + 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', + 'md5': 'b8f850ba1860adbda668d367f9b77699', 'info_dict': { - 'id': '3649835190001', - 'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', + 'id': 'PynxJnNWChE', 'ext': 'mp4', 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', - 'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0', + 'description': 'md5:49cc8370e7d66e8a2ef15c3b4631fd3f', 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1404039863, - 'upload_date': '20140629', - 'duration': 69.544, - 'uploader_id': '1027729757001', + 'upload_date': '20160916', + 'duration': 69, + 'uploader_id': 'UCk5pvsyZJoYJBd7_oFPTlRQ', + 'uploader': 'Dagbladet', }, - 'add_ie': ['BrightcoveNew'] + 'add_ie': ['Youtube'] }, { - 'url': 'http://dbtv.no/3649835190001', + 'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false', 'only_matching': True, }, { - 'url': 'http://www.dbtv.no/lazyplayer/4631135248001', - 'only_matching': True, - }, { - 'url': 'http://dbtv.no/vice/5000634109001', - 'only_matching': True, - }, { - 'url': 'http://dbtv.no/filmtrailer/3359293614001', + 'url': 'https://www.dagbladet.no/video/truer-iran-bor-passe-dere/PalfB2Cw', 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return [url for _, url in re.findall( - r']+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1', + r']+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', webpage)] def _real_extract(self, url): - video_id, display_id = re.match(self._VALID_URL, url).groups() - - return { + display_id, video_id = re.match(self._VALID_URL, url).groups() + info = { '_type': 'url_transparent', - 'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id, 'id': video_id, 'display_id': display_id, - 'ie_key': 'BrightcoveNew', } + if len(video_id) == 11: + info.update({ + 'url': video_id, + 'ie_key': 'Youtube', + }) + else: + info.update({ + 'url': 'jwplatform:' + video_id, + 'ie_key': 'JWPlatform', + }) + return info diff --git a/youtube_dl/extractor/dlive.py b/youtube_dl/extractor/dlive.py new file mode 100644 index 000000000..8787f15a6 --- /dev/null +++ b/youtube_dl/extractor/dlive.py @@ -0,0 +1,94 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class DLiveVODIE(InfoExtractor): + IE_NAME = 'dlive:vod' + _VALID_URL = r'https?://(?:www\.)?dlive\.tv/p/(?P.+?)\+(?P[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://dlive.tv/p/pdp+3mTzOl4WR', + 'info_dict': { + 'id': '3mTzOl4WR', + 'ext': 'mp4', + 'title': 'Minecraft with james charles epic', + 'upload_date': '20190701', + 'timestamp': 1562011015, + 'uploader_id': 'pdp', + } + } + + def _real_extract(self, url): + uploader_id, vod_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + 'https://graphigo.prd.dlive.tv/', vod_id, + data=json.dumps({'query': '''query { + pastBroadcast(permlink:"%s+%s") { + content + createdAt + length + playbackUrl + title + thumbnailUrl + viewCount + } +}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast'] + title = broadcast['title'] + formats = self._extract_m3u8_formats( + broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + return { + 'id': vod_id, + 'title': title, + 'uploader_id': uploader_id, + 'formats': formats, + 'description': broadcast.get('content'), + 'thumbnail': broadcast.get('thumbnailUrl'), + 'timestamp': int_or_none(broadcast.get('createdAt'), 1000), + 'view_count': int_or_none(broadcast.get('viewCount')), + } + + +class DLiveStreamIE(InfoExtractor): + IE_NAME = 'dlive:stream' + _VALID_URL = r'https?://(?:www\.)?dlive\.tv/(?!p/)(?P[\w.-]+)' + + def _real_extract(self, url): + display_name = self._match_id(url) + user = self._download_json( + 'https://graphigo.prd.dlive.tv/', display_name, + data=json.dumps({'query': '''query { + userByDisplayName(displayname:"%s") { + livestream { + content + createdAt + title + thumbnailUrl + watchingCount + } + username + } +}''' % display_name}).encode())['data']['userByDisplayName'] + livestream = user['livestream'] + title = livestream['title'] + username = user['username'] + formats = self._extract_m3u8_formats( + 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, + display_name, 'mp4') + self._sort_formats(formats) + return { + 'id': display_name, + 'title': self._live_title(title), + 'uploader': display_name, + 'uploader_id': username, + 'formats': formats, + 'description': livestream.get('content'), + 'thumbnail': livestream.get('thumbnailUrl'), + 'is_live': True, + 'timestamp': int_or_none(livestream.get('createdAt'), 1000), + 'view_count': int_or_none(livestream.get('watchingCount')), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 555fadfaf..15f54a214 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -579,6 +579,7 @@ from .linkedin import ( ) from .linuxacademy import LinuxAcademyIE from .litv import LiTVIE +from .livejournal import LiveJournalIE from .liveleak import ( LiveLeakIE, LiveLeakEmbedIE, @@ -967,7 +968,6 @@ from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE from .rtvs import RTVSIE -from .rudo import RudoIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, @@ -1255,6 +1255,10 @@ from .udn import UDNEmbedIE from .ufctv import UFCTVIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) from .umg import UMGDeIE from .unistra import UnistraIE from .unity import UnityIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 33d2acc91..517eb83c4 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -462,8 +462,8 @@ class FacebookIE(InfoExtractor): r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, 'uploader_id', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if is_live: view_count = parse_count( self._search_regex(r'viewerCount:([\d]+)', webpage, 'views', fatal=False) or \ diff --git a/youtube_dl/extractor/livejournal.py b/youtube_dl/extractor/livejournal.py new file mode 100644 index 000000000..3a9f4553f --- /dev/null +++ b/youtube_dl/extractor/livejournal.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class LiveJournalIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?livejournal\.com/video/album/\d+.+?\bid=(?P\d+)' + _TEST = { + 'url': 'https://andrei-bt.livejournal.com/video/album/407/?mode=view&id=51272', + 'md5': 'adaf018388572ced8a6f301ace49d4b2', + 'info_dict': { + 'id': '1263729', + 'ext': 'mp4', + 'title': 'Истребители против БПЛА', + 'upload_date': '20190624', + 'timestamp': 1561406715, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + record = self._parse_json(self._search_regex( + r'Site\.page\s*=\s*({.+?});', webpage, + 'page data'), video_id)['video']['record'] + storage_id = compat_str(record['storageid']) + title = record.get('name') + if title: + # remove filename extension(.mp4, .mov, etc...) + title = title.rsplit('.', 1)[0] + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'thumbnail': record.get('thumbnail'), + 'timestamp': int_or_none(record.get('timecreate')), + 'url': 'eagleplatform:vc.videos.livejournal.com:' + storage_id, + 'ie_key': 'EaglePlatform', + } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 3084c6dff..b3d8653d0 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -117,6 +117,10 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', 'only_matching': True, + }, { + # Status="NotFound", Message="Transcript not found" + 'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -247,12 +251,17 @@ class LyndaIE(LyndaBaseIE): def _get_subtitles(self, video_id): url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - subs = self._download_json(url, None, False) + subs = self._download_webpage( + url, video_id, 'Downloading subtitles JSON', fatal=False) + if not subs or 'Status="NotFound"' in subs: + return {} + subs = self._parse_json(subs, video_id, fatal=False) + if not subs: + return {} fixed_subs = self._fix_subtitles(subs) if fixed_subs: return {'en': [{'ext': 'srt', 'data': fixed_subs}]} - else: - return {} + return {} class LyndaCourseIE(LyndaBaseIE): diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 24c3600fe..20eac647a 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -39,7 +39,12 @@ class Porn91IE(InfoExtractor): r'
([^<]+)
', webpage, 'title') title = title.replace('\n', '') - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + video_link_url = self._search_regex( + r']+id=["\']fm-video_link[^>]+>([^<]+)', + webpage, 'video link') + videopage = self._download_webpage(video_link_url, video_id) + + info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0] duration = parse_duration(self._search_regex( r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 857434540..d3eeeba62 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -4,11 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, int_or_none, - strip_or_none, - unescapeHTML, + str_or_none, urlencode_postdata, ) @@ -21,15 +24,14 @@ class RoosterTeethIE(InfoExtractor): 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'md5': 'e2bd7764732d785ef797700a2489f212', 'info_dict': { - 'id': '26576', + 'id': '9156', 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', 'ext': 'mp4', - 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', - 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', 'thumbnail': r're:^https?://.*\.png$', 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... The Game Announcement', - 'comment_count': int, }, }, { 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', @@ -89,60 +91,55 @@ class RoosterTeethIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + api_episode_url = 'https://svod-be.roosterteeth.com/api/v1/episodes/%s' % display_id - webpage = self._download_webpage(url, display_id) - - episode = strip_or_none(unescapeHTML(self._search_regex( - (r'videoTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - r'<title>(?P<title>[^<]+)'), webpage, 'title', - default=None, group='title'))) - - title = strip_or_none(self._og_search_title( - webpage, default=None)) or episode - - m3u8_url = self._search_regex( - r'file\s*:\s*(["\'])(?Phttp.+?\.m3u8.*?)\1', - webpage, 'm3u8 url', default=None, group='url') - - if not m3u8_url: - if re.search(r']+class=["\']non-sponsor', webpage): - self.raise_login_required( - '%s is only available for FIRST members' % display_id) - - if re.search(r']+class=["\']golive-gate', webpage): - self.raise_login_required('%s is not available yet' % display_id) - - raise ExtractorError('Unable to extract m3u8 URL') + try: + m3u8_url = self._download_json( + api_episode_url + '/videos', display_id, + 'Downloading video JSON metadata')['data'][0]['attributes']['url'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + raise formats = self._extract_m3u8_formats( - m3u8_url, display_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - description = strip_or_none(self._og_search_description(webpage)) - thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + episode = self._download_json( + api_episode_url, display_id, + 'Downloading episode JSON metadata')['data'][0] + attributes = episode['attributes'] + title = attributes.get('title') or attributes['display_title'] + video_id = compat_str(episode['id']) - series = self._search_regex( - (r'

More ([^<]+)

', r']+>See All ([^<]+) Videos<'), - webpage, 'series', fatal=False) - - comment_count = int_or_none(self._search_regex( - r'>Comments \((\d+)\)<', webpage, - 'comment count', fatal=False)) - - video_id = self._search_regex( - (r'containerId\s*=\s*["\']episode-(\d+)\1', - r'[0-9a-zA-Z]+)' - - _TEST = { - 'url': 'http://rudo.video/vod/oTzw0MGnyG', - 'md5': '2a03a5b32dd90a04c83b6d391cf7b415', - 'info_dict': { - 'id': 'oTzw0MGnyG', - 'ext': 'mp4', - 'title': 'Comentario Tomás Mosciatti', - 'upload_date': '20160617', - }, - } - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r']+src=(?P[\'"])(?P(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id, encoding='iso-8859-1') - - jwplayer_data = self._parse_json(self._search_regex( - r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id, - transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s))) - - info_dict = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, m3u8_id='hls', mpd_id='dash') - - info_dict.update({ - 'title': self._og_search_title(webpage), - 'upload_date': unified_strdate(get_element_by_class('date', webpage)), - }) - - return info_dict diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index f11d728ca..e040ada29 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + merge_dicts, orderedSet, parse_duration, parse_resolution, @@ -26,6 +27,8 @@ class SpankBangIE(InfoExtractor): 'description': 'dillion harper masturbates on a bed', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', + 'timestamp': 1422571989, + 'upload_date': '20150129', 'age_limit': 18, } }, { @@ -106,31 +109,36 @@ class SpankBangIE(InfoExtractor): for format_id, format_url in stream.items(): if format_id.startswith(STREAM_URL_PREFIX): + if format_url and isinstance(format_url, list): + format_url = format_url[0] extract_format( format_id[len(STREAM_URL_PREFIX):], format_url) self._sort_formats(formats) + info = self._search_json_ld(webpage, video_id, default={}) + title = self._html_search_regex( - r'(?s)]*>(.+?)', webpage, 'title') + r'(?s)]*>(.+?)', webpage, 'title', default=None) description = self._search_regex( r']+\bclass=["\']bottom[^>]+>\s*

[^<]*

\s*

([^<]+)', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - uploader = self._search_regex( - r'class="user"[^>]*>]+>([^<]+)', + webpage, 'description', default=None) + thumbnail = self._og_search_thumbnail(webpage, default=None) + uploader = self._html_search_regex( + (r'(?s)]+class=["\']profile[^>]+>(.+?)', + r'class="user"[^>]*>]+>([^<]+)'), webpage, 'uploader', default=None) duration = parse_duration(self._search_regex( r']+\bclass=["\']right_side[^>]+>\s*([^<]+)', - webpage, 'duration', fatal=False)) + webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( - r'([\d,.]+)\s+plays', webpage, 'view count', fatal=False)) + r'([\d,.]+)\s+plays', webpage, 'view count', default=None)) age_limit = self._rta_search(webpage) - return { + return merge_dicts({ 'id': video_id, - 'title': title, + 'title': title or video_id, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, @@ -138,7 +146,8 @@ class SpankBangIE(InfoExtractor): 'view_count': view_count, 'formats': formats, 'age_limit': age_limit, - } + }, info + ) class SpankBangPlaylistIE(InfoExtractor): diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 21b93a5b3..7c11ea7aa 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -22,7 +22,7 @@ class BellatorIE(MTVServicesInfoExtractor): 'only_matching': True, }] - _FEED_URL = 'http://www.spike.com/feeds/mrss/' + _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 422350c3b..561484138 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -438,11 +438,22 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + 'url': 'https://twitter.com/foobar/status/1087791357756956680', + 'info_dict': { + 'id': '1087791357756956680', + 'ext': 'mp4', + 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:66d493500c013e3e2d434195746a7f78', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'duration': 61.567, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('user_id') twid = mobj.group('id') webpage, urlh = self._download_webpage_handle( @@ -451,8 +462,13 @@ class TwitterIE(InfoExtractor): if 'twitter.com/account/suspended' in urlh.geturl(): raise ExtractorError('Account suspended by Twitter.', expected=True) - if user_id is None: - mobj = re.match(self._VALID_URL, urlh.geturl()) + user_id = None + + redirect_mobj = re.match(self._VALID_URL, urlh.geturl()) + if redirect_mobj: + user_id = redirect_mobj.group('user_id') + + if not user_id: user_id = mobj.group('user_id') username = remove_end(self._og_search_title(webpage), ' on Twitter') diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 1dfee59e9..08d35e04c 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,6 +10,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + try_get, ) @@ -51,23 +52,43 @@ class YandexMusicTrackIE(YandexMusicBaseIE): IE_DESC = 'Яндекс.Музыка - Трек' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508/track/4878838', 'md5': 'f496818aa2f60b6c0062980d2e00dc20', 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, 'track': 'Gypsy Eyes 1', 'album': 'Gypsy Soul', 'album_artist': 'Carlo Ambrosio', - 'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari', + 'artist': 'Carlo Ambrosio & Fabio Di Bari', 'release_year': 2009, }, 'skip': 'Travis CI servers blocked by YandexMusic', - } + }, { + # multiple disks + 'url': 'http://music.yandex.ru/album/3840501/track/705105', + 'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e', + 'info_dict': { + 'id': '705105', + 'ext': 'mp3', + 'title': 'Hooverphonic - Sometimes', + 'filesize': 5743386, + 'duration': 239.27, + 'track': 'Sometimes', + 'album': 'The Best of Hooverphonic', + 'album_artist': 'Hooverphonic', + 'artist': 'Hooverphonic', + 'release_year': 2016, + 'genre': 'pop', + 'disc_number': 2, + 'track_number': 9, + }, + 'skip': 'Travis CI servers blocked by YandexMusic', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -110,9 +131,21 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'abr': int_or_none(download_data.get('bitrate')), } + def extract_artist_name(artist): + decomposed = artist.get('decomposed') + if not isinstance(decomposed, list): + return artist['name'] + parts = [artist['name']] + for element in decomposed: + if isinstance(element, dict) and element.get('name'): + parts.append(element['name']) + elif isinstance(element, compat_str): + parts.append(element) + return ''.join(parts) + def extract_artist(artist_list): if artist_list and isinstance(artist_list, list): - artists_names = [a['name'] for a in artist_list if a.get('name')] + artists_names = [extract_artist_name(a) for a in artist_list if a.get('name')] if artists_names: return ', '.join(artists_names) @@ -121,10 +154,17 @@ class YandexMusicTrackIE(YandexMusicBaseIE): album = albums[0] if isinstance(album, dict): year = album.get('year') + disc_number = int_or_none(try_get( + album, lambda x: x['trackPosition']['volume'])) + track_number = int_or_none(try_get( + album, lambda x: x['trackPosition']['index'])) track_info.update({ 'album': album.get('title'), 'album_artist': extract_artist(album.get('artists')), 'release_year': int_or_none(year), + 'genre': album.get('genre'), + 'disc_number': disc_number, + 'track_number': track_number, }) track_artist = extract_artist(track.get('artists')) @@ -152,7 +192,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): IE_DESC = 'Яндекс.Музыка - Альбом' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/?(\?|$)' - _TEST = { + _TESTS = [{ 'url': 'http://music.yandex.ru/album/540508', 'info_dict': { 'id': '540508', @@ -160,7 +200,15 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): }, 'playlist_count': 50, 'skip': 'Travis CI servers blocked by YandexMusic', - } + }, { + 'url': 'https://music.yandex.ru/album/3840501', + 'info_dict': { + 'id': '3840501', + 'title': 'Hooverphonic - The Best of Hooverphonic (2016)', + }, + 'playlist_count': 33, + 'skip': 'Travis CI servers blocked by YandexMusic', + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +217,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, album_id, 'Downloading album JSON') - entries = self._build_playlist(album['volumes'][0]) + entries = self._build_playlist([track for volume in album['volumes'] for track in volume]) title = '%s - %s' % (album['artists'][0]['name'], album['title']) year = album.get('year') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8a3c502ba..762611b89 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -371,10 +371,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| - (?:www\.)?invidiou\.sh/| - (?:www\.)?invidious\.snopyta\.org/| + (?:(?:www|no)\.)?invidiou\.sh/| + (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?invidious\.enkirton\.net/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 18bcb33e2..8a7f4d733 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.07.12' +__version__ = '2019.07.14'