diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 35f8e6863..c208eb689 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.19 +[debug] youtube-dl version 2016.04.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/README.md b/README.md index 7e03e0523..c127c4aba 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,9 @@ which means you can modify it, redistribute it or use it however you like. --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of - ffmpeg (experimental) + ffmpeg + --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS + downloader --hls-use-mpegts Use the mpegts container for HLS videos, allowing to play the video while downloading (some players may not be able @@ -516,6 +518,18 @@ Available for the video that is an episode of some series or programme: - `episode_number`: Number of the video episode within a season - `episode_id`: Id of the video episode +Available for the media that is a track or a part of a music album: + - `track`: Title of the track + - `track_number`: Number of the track within an album or a disc + - `track_id`: Id of the track + - `artist`: Artist(s) of the track + - `genre`: Genre(s) of the track + - `album`: Title of the album the track belongs to + - `album_type`: Type of the album + - `album_artist`: List of all artists appeared on the album + - `disc_number`: Number of the disc or other physical medium the track belongs to + - `release_year`: Year (YYYY) when the album was released + Each aforementioned sequence when referenced in output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`. For example for `-o %(title)s-%(id)s.%(ext)s` and mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj` this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e12a7d182..03875b8db 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -163,6 +163,7 @@ - **defense.gouv.fr** - **democracynow** - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **DigitallySpeaking** - **Digiteka** - **Discovery** - **Dotsub** @@ -174,7 +175,6 @@ - **Dropbox** - **DrTuber** - **DRTV** - - **Dump** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -345,6 +345,7 @@ - **metacafe** - **Metacritic** - **Mgoon** + - **MGTV**: 芒果TV - **Minhateca** - **MinistryGrid** - **Minoto** @@ -413,7 +414,8 @@ - **nfl.com** - **nhl.com** - **nhl.com:news**: NHL news - - **nhl.com:videocenter**: NHL videocenter category + - **nhl.com:videocenter** + - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** @@ -461,13 +463,13 @@ - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** + - **People** - **Periscope**: Periscope - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** - **Pinkbike** - **Pladform** - - **PlanetaPlay** - **play.fm** - **played.to** - **PlaysTV** @@ -497,7 +499,6 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - - **QuickVid** - **R7** - **radio.de** - **radiobremen** diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index d8aa31038..68a0633b6 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -33,6 +33,7 @@ class CBCIE(InfoExtractor): 'title': 'Robin Williams freestyles on 90 Minutes Live', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', 'upload_date': '19700101', + 'uploader': 'CBCC-NEW', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 3915cb182..ce6962755 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -43,7 +43,7 @@ class DouyuTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'skip': 'Romm not found', + 'skip': 'Room not found', }, { 'url': 'http://www.douyutv.com/17732', 'info_dict': { @@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor): 'display_id': '17732', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:f34981259a03e980a3c6404190a3ed61', + 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -75,13 +75,28 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id"\s*:\s*(\d+),', page, 'room id') - prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( - room_id, int(time.time())) + config = None + # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache" + # Retry with different parameters - same parameters cause same errors + for i in range(5): + prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( + room_id, int(time.time())) + auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() - auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() - config = self._download_json( - 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), - video_id) + config_page = self._download_webpage( + 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), + video_id) + try: + config = self._parse_json(config_page, video_id, fatal=False) + except ExtractorError: + # Wait some time before retrying to get a different time() value + self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. ' + 'Waiting for %(timeout)s seconds before retrying') + continue + else: + break + if config is None: + raise ExtractorError('Unable to fetch API result') data = config['data'] diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 66bbfc6ca..5790553f3 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -6,13 +6,18 @@ import re import time from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + update_url_query, +) class DPlayIE(InfoExtractor): _VALID_URL = r'https?://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' _TESTS = [{ + # geo restricted, via direct unsigned hls URL 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', 'info_dict': { 'id': '1255600', @@ -31,11 +36,12 @@ class DPlayIE(InfoExtractor): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { + # non geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', 'info_dict': { 'id': '3172', 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Svensken lär sig njuta av livet', 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', 'duration': 2650, @@ -48,23 +54,25 @@ class DPlayIE(InfoExtractor): 'age_limit': 0, }, }, { + # geo restricted, via secure api, unsigned download hls URL 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', 'info_dict': { 'id': '70816', 'display_id': 'season-6-episode-12', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 12', 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', 'duration': 2563, 'timestamp': 1429696800, 'upload_date': '20150422', - 'creator': 'Kanal 4', + 'creator': 'Kanal 4 (Home)', 'series': 'Mig og min mor', 'season_number': 6, 'episode_number': 12, 'age_limit': 0, }, }, { + # geo restricted, via direct unsigned hls URL 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', 'only_matching': True, }] @@ -90,17 +98,24 @@ class DPlayIE(InfoExtractor): def extract_formats(protocol, manifest_url): if protocol == 'hls': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( manifest_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)) + entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) + # Sometimes final URLs inside m3u8 are unsigned, let's fix this + # ourselves + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) + for m3u8_format in m3u8_formats: + m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + formats.extend(m3u8_formats) elif protocol == 'hds': formats.extend(self._extract_f4m_formats( manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', video_id, f4m_id=protocol, fatal=False)) domain_tld = domain.split('.')[-1] - if domain_tld in ('se', 'dk'): + if domain_tld in ('se', 'dk', 'no'): for protocol in PROTOCOLS: + # Providing dsc-geo allows to bypass geo restriction in some cases self._set_cookie( 'secure.dplay.%s' % domain_tld, 'dsc-geo', json.dumps({ @@ -113,13 +128,24 @@ class DPlayIE(InfoExtractor): 'Downloading %s stream JSON' % protocol, fatal=False) if stream and stream.get(protocol): extract_formats(protocol, stream[protocol]) - else: + + # The last resort is to try direct unsigned hls/hds URLs from info dictionary. + # Sometimes this does work even when secure API with dsc-geo has failed (e.g. + # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/). + if not formats: for protocol in PROTOCOLS: if info.get(protocol): extract_formats(protocol, info[protocol]) self._sort_formats(formats) + subtitles = {} + for lang in ('se', 'sv', 'da', 'nl', 'no'): + for format_id in ('web_vtt', 'vtt', 'srt'): + subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id)) + if subtitle_url: + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + return { 'id': video_id, 'display_id': display_id, @@ -133,4 +159,5 @@ class DPlayIE(InfoExtractor): 'episode_number': int_or_none(info.get('episode')), 'age_limit': int_or_none(info.get('minimum_age')), 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 27a7e89a4..6de3438fc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -490,9 +490,10 @@ from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( - NHLIE, - NHLNewsIE, NHLVideocenterIE, + NHLNewsIE, + NHLVideocenterCategoryIE, + NHLIE, ) from .nick import NickIE from .niconico import NiconicoIE, NiconicoPlaylistIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 95d233259..c63bdbd08 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -237,6 +237,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', + 'upload_date': '20130904', }, 'params': { 'format': 'bestvideo', @@ -596,7 +597,11 @@ class GenericIE(InfoExtractor): 'id': 'k2mm4bCdJ6CQ2i7c8o2', 'ext': 'mp4', 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', 'uploader': 'Spi0n', + 'uploader_id': 'xgditw', + 'upload_date': '20140425', + 'timestamp': 1398441542, }, 'add_ie': ['Dailymotion'], }, @@ -729,8 +734,11 @@ class GenericIE(InfoExtractor): 'id': 'uxjb0lwrcz', 'ext': 'mp4', 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'description': 'a Martin Fowler video from ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', + 'upload_date': '20140603', + 'timestamp': 1401832161, }, }, # Soundcloud embed @@ -981,6 +989,9 @@ class GenericIE(InfoExtractor): 'ext': 'flv', 'title': "PFT Live: New leader in the 'new-look' defense", 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20140107', + 'timestamp': 1389118457, }, }, # UDN embed @@ -1033,6 +1044,9 @@ class GenericIE(InfoExtractor): 'title': 'SN Presents: Russell Martin, World Citizen', 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, }, }, # Dailymotion Cloud video @@ -1124,6 +1138,9 @@ class GenericIE(InfoExtractor): 'title': 'The Cardinal Pell Interview', 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, }, 'params': { # m3u8 downloads @@ -2045,6 +2062,7 @@ class GenericIE(InfoExtractor): entries = [] for video_url in found: + video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') video_url = compat_urlparse.urljoin(url, video_url) video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ea8fbb329..ffb8008ce 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -287,6 +287,13 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] + AUTH_API_ERRORS = { + # No preview available (不允许试看鉴权失败) + 'Q00505': 'This video requires a VIP account', + # End of preview time (试看结束鉴权失败) + 'Q00506': 'Needs a VIP account for full video', + } + def _real_initialize(self): self._login() @@ -372,14 +379,18 @@ class IqiyiIE(InfoExtractor): note='Downloading video authentication JSON', errnote='Unable to download video authentication JSON') - if auth_result['code'] == 'Q00505': # No preview available (不允许试看鉴权失败) - raise ExtractorError('This video requires a VIP account', expected=True) - if auth_result['code'] == 'Q00506': # End of preview time (试看结束鉴权失败) + code = auth_result.get('code') + msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code + if code == 'Q00506': if do_report_warning: - self.report_warning('Needs a VIP account for full video') + self.report_warning(msg) return False + if 'data' not in auth_result: + if msg is not None: + raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) + raise ExtractorError('Unexpected error from Iqiyi auth API') - return auth_result + return auth_result['data'] def construct_video_urls(self, data, video_id, _uuid, tvid): def do_xor(x, y): @@ -455,11 +466,11 @@ class IqiyiIE(InfoExtractor): need_vip_warning_report = False break param.update({ - 't': auth_result['data']['t'], + 't': auth_result['t'], # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as 'cid': 'afbe8fd3d73448c9', 'vid': video_id, - 'QY00001': auth_result['data']['u'], + 'QY00001': auth_result['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' api_video_url += compat_urllib_parse_urlencode(param) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index c0ece5113..3740869c7 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -81,7 +81,7 @@ class KuwoIE(KuwoBaseIE): 'id': '6446136', 'ext': 'mp3', 'title': '心', - 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a', + 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c', 'creator': 'IU', 'upload_date': '20150518', }, @@ -102,10 +102,10 @@ class KuwoIE(KuwoBaseIE): raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( - r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?]+title="([^"]+)"', webpage, 'song name') - singer_name = self._html_search_regex( - r']+class="s_img">\s*]+title="([^>]+)"', - webpage, 'singer name', fatal=False) + r']+id="lrcName">([^<]+)

', webpage, 'song name') + singer_name = remove_start(self._html_search_regex( + r']+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">', + webpage, 'singer name', fatal=False), '歌手') lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -114,7 +114,7 @@ class KuwoIE(KuwoBaseIE): self._sort_formats(formats) album_id = self._html_search_regex( - r']+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', + r']+href="http://www\.kuwo\.cn/album/(\d+)/"', webpage, 'album id', fatal=False) publish_time = None @@ -268,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor): 'title': '八十年代精选', 'description': '这些都是属于八十年代的回忆!', }, - 'playlist_count': 24, + 'playlist_mincount': 24, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 61dadb7a7..b6f00cc25 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -81,6 +81,9 @@ class MetacafeIE(InfoExtractor): 'title': 'Open: This is Face the Nation, February 9', 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', 'duration': 96, + 'uploader': 'CBSI-NEW', + 'upload_date': '20140209', + 'timestamp': 1391959800, }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 66b523197..5c3c8d464 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -12,7 +12,7 @@ class MwaveIE(InfoExtractor): _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P[0-9]+)' _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', - 'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc', + # md5 is unstable 'info_dict': { 'id': '168859', 'ext': 'flv', diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e67025ff6..46504cd5f 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -134,6 +134,9 @@ class NBCSportsIE(InfoExtractor): 'ext': 'flv', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20150330', + 'timestamp': 1427726529, } } @@ -172,7 +175,7 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ (?:video/.+?/(?P\d+)| ([^/]+/)*(?P[^/?]+)) ''' @@ -230,6 +233,18 @@ class NBCNewsIE(ThePlatformIE): }, 'expected_warnings': ['http-6000 is not available'] }, + { + 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', + 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'info_dict': { + 'id': '669831235788', + 'ext': 'mp4', + 'title': 'See the aurora borealis from space in stunning new NASA video', + 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', + 'upload_date': '20160420', + 'timestamp': 1461152093, + }, + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -264,7 +279,10 @@ class NBCNewsIE(ThePlatformIE): info = bootstrap['results'][0]['video'] else: player_instance_json = self._search_regex( - r'videoObj\s*:\s*({.+})', webpage, 'player instance') + r'videoObj\s*:\s*({.+})', webpage, 'player instance', default=None) + if not player_instance_json: + player_instance_json = self._html_search_regex( + r'data-video="([^"]+)"', webpage, 'video json') info = self._parse_json(player_instance_json, display_id) video_id = info['mpxId'] title = info['title'] @@ -295,7 +313,7 @@ class NBCNewsIE(ThePlatformIE): formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) else: - tbr = int_or_none(video_asset.get('bitRate'), 1000) + tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) format_id = 'http%s' % ('-%d' % tbr if tbr else '') video_url = update_url_query( video_url, {'format': 'redirect'}) @@ -321,10 +339,9 @@ class NBCNewsIE(ThePlatformIE): 'id': video_id, 'title': title, 'description': info.get('description'), - 'thumbnail': info.get('description'), 'thumbnail': info.get('thumbnail'), 'duration': int_or_none(info.get('duration')), - 'timestamp': parse_iso8601(info.get('pubDate')), + 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index c1dea8b6c..b04d21113 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -8,10 +8,15 @@ from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse + compat_urllib_parse_urlparse, + compat_str, ) from ..utils import ( unified_strdate, + determine_ext, + int_or_none, + parse_iso8601, + parse_duration, ) @@ -70,8 +75,8 @@ class NHLBaseInfoExtractor(InfoExtractor): return ret -class NHLIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com' +class NHLVideocenterIE(NHLBaseInfoExtractor): + IE_NAME = 'nhl.com:videocenter' _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' _TESTS = [{ @@ -186,8 +191,8 @@ class NHLNewsIE(NHLBaseInfoExtractor): return self._real_extract_video(video_id) -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' +class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): + IE_NAME = 'nhl.com:videocenter:category' IE_DESC = 'NHL videocenter category' _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' _TEST = { @@ -236,3 +241,86 @@ class NHLVideocenterIE(NHLBaseInfoExtractor): 'id': cat_id, 'entries': [self._extract_video(v) for v in videos], } + + +class NHLIE(InfoExtractor): + IE_NAME = 'nhl.com' + _VALID_URL = r'https?://(?:www\.)?nhl\.com/([^/]+/)*c-(?P\d+)' + _TESTS = [{ + # type=video + 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', + 'md5': '0f7b9a8f986fb4b4eeeece9a56416eaf', + 'info_dict': { + 'id': '43663503', + 'ext': 'mp4', + 'title': 'Anisimov cleans up mess', + 'description': 'md5:a02354acdfe900e940ce40706939ca63', + 'timestamp': 1461288600, + 'upload_date': '20160422', + }, + }, { + # type=article + 'url': 'https://www.nhl.com/news/dennis-wideman-suspended/c-278258934', + 'md5': '1f39f4ea74c1394dea110699a25b366c', + 'info_dict': { + 'id': '40784403', + 'ext': 'mp4', + 'title': 'Wideman suspended by NHL', + 'description': 'Flames defenseman Dennis Wideman was banned 20 games for violation of Rule 40 (Physical Abuse of Officials)', + 'upload_date': '20160204', + 'timestamp': 1454544904, + }, + }] + + def _real_extract(self, url): + tmp_id = self._match_id(url) + video_data = self._download_json( + 'https://nhl.bamcontent.com/nhl/id/v1/%s/details/web-v1.json' % tmp_id, + tmp_id) + if video_data.get('type') == 'article': + video_data = video_data['media'] + + video_id = compat_str(video_data['id']) + title = video_data['title'] + + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False)) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + }) + self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 6e843c327..d7b13a0f1 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -65,7 +65,7 @@ class OnionStudiosIE(InfoExtractor): r'share_title\s*=\s*(["\'])(?P[^\1]+?)\1', webpage, 'title', group='title') description = self._search_regex( - r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1', + r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1', webpage, 'description', default=None, group='description') thumbnail = self._search_regex( r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 4468f31fc..456561bcc 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -6,8 +6,10 @@ import re from .common import InfoExtractor from ..compat import compat_chr from ..utils import ( + determine_ext, encode_base_n, ExtractorError, + mimetype2ext, ) @@ -29,6 +31,11 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://openload.io/f/ZAn6oz-VZGE/', 'only_matching': True, + }, { + # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout + # for title and ext + 'url': 'https://openload.co/embed/Sxz5sADo82g/', + 'only_matching': True, }] @staticmethod @@ -96,12 +103,25 @@ class OpenloadIE(InfoExtractor): r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', webpage, 'JS code') + decoded = self.openload_decode(code) + video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, + 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', fatal=True) + + ext = mimetype2ext(self._search_regex( + r'window\.vt\s*=\s*(["\'])(?P<mimetype>.+?)\1', decoded, + 'mimetype', default=None, group='mimetype')) or determine_ext( + video_url, 'mp4') return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'ext': ext, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 7a5a533b7..8272dd969 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -269,6 +269,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): 'timestamp': 1391824260, 'duration': 467.0, 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], + 'uploader': 'NBCU-NEWS', }, } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fe94a4793..7839225d4 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -14,6 +16,7 @@ from ..utils import ( parse_iso8601, sanitized_Request, HEADRequest, + url_basename, ) @@ -114,6 +117,7 @@ class ViewsterIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) formats = [] + manifest_url = None for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): media = self._download_json( 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' @@ -126,29 +130,42 @@ class ViewsterIE(InfoExtractor): continue ext = determine_ext(video_url) if ext == 'f4m': + manifest_url = video_url video_url += '&' if '?' in video_url else '?' video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': + manifest_url = video_url m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) # m3u8 sometimes fail if m3u8_formats: formats.extend(m3u8_formats) else: - format_id = media.get('Bitrate') - f = { - 'url': video_url, - 'format_id': 'mp4-%s' % format_id, - 'height': int_or_none(media.get('Height')), - 'width': int_or_none(media.get('Width')), - 'preference': 1, - } - if format_id and not f['height']: - f['height'] = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - formats.append(f) + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if not qualities_basename: + continue + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if not qualities: + continue + qualities = qualities.strip(',').split(',') + http_template = re.sub(QUALITIES_RE, r'%s', qualities_basename) + http_url_basename = url_basename(video_url) + for q in qualities: + tbr = int_or_none(self._search_regex( + r'(\d+)k', q, 'bitrate', default=None)) + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + }) if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): self.raise_geo_restricted() diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py index 7c9d8af6f..36e5ead1e 100644 --- a/youtube_dl/extractor/xminus.py +++ b/youtube_dl/extractor/xminus.py @@ -2,15 +2,15 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor from ..compat import ( - compat_chr, compat_ord, ) from ..utils import ( int_or_none, - parse_filesize, + parse_duration, ) @@ -22,7 +22,7 @@ class XMinusIE(InfoExtractor): 'info_dict': { 'id': '4542', 'ext': 'mp3', - 'title': 'Леонид Агутин-Песенка шофера', + 'title': 'Леонид Агутин-Песенка шофёра', 'duration': 156, 'tbr': 320, 'filesize_approx': 5900000, @@ -36,38 +36,41 @@ class XMinusIE(InfoExtractor): webpage = self._download_webpage(url, video_id) artist = self._html_search_regex( - r'minus_track\.artist="(.+?)"', webpage, 'artist') + r'<a[^>]+href="/artist/\d+">([^<]+)</a>', webpage, 'artist') title = artist + '-' + self._html_search_regex( - r'minus_track\.title="(.+?)"', webpage, 'title') - duration = int_or_none(self._html_search_regex( - r'minus_track\.dur_sec=\'([0-9]*?)\'', + r'<span[^>]+class="minustrack-full-title(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_regex( + r'<span[^>]+class="player-duration(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'duration', fatal=False)) - filesize_approx = parse_filesize(self._html_search_regex( - r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])', - webpage, 'approximate filesize', fatal=False)) - tbr = int_or_none(self._html_search_regex( - r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps', - webpage, 'bitrate', fatal=False)) + mobj = re.search( + r'<div[^>]+class="dw-info(?:\s+[^"]+)?"[^>]*>(?P<tbr>\d+)\s*кбит/c\s+(?P<filesize>[0-9.]+)\s*мб</div>', + webpage) + tbr = filesize_approx = None + if mobj: + filesize_approx = float(mobj.group('filesize')) * 1000000 + tbr = float(mobj.group('tbr')) view_count = int_or_none(self._html_search_regex( - r'<div class="quality.*?► ([0-9]+)', + r'<span><[^>]+class="icon-chart-bar".*?>(\d+)</span>', webpage, 'view count', fatal=False)) description = self._html_search_regex( - r'(?s)<div id="song_texts">(.*?)</div><br', + r'(?s)<pre[^>]+id="lyrics-original"[^>]*>(.*?)</pre>', webpage, 'song lyrics', fatal=False) if description: description = re.sub(' *\r *', '\n', description) - enc_token = self._html_search_regex( - r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token') - token = ''.join( - c if pos == 3 else compat_chr(compat_ord(c) - 1) - for pos, c in enumerate(reversed(enc_token))) - video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token) + k = self._search_regex( + r'<div[^>]+id="player-bottom"[^>]+data-k="([^"]+)">', webpage, + 'encoded data') + h = time.time() / 3600 + a = sum(map(int, [compat_ord(c) for c in k])) + int(video_id) + h + video_url = 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h) return { 'id': video_id, 'title': title, 'url': video_url, + # The extension is unknown until actual downloading + 'ext': 'mp3', 'duration': duration, 'filesize_approx': filesize_approx, 'tbr': tbr, diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b2d8f4b48..b376f2b93 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -24,7 +24,7 @@ from .nbc import NBCSportsVPlayerIE class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' - _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)' + _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?)' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -38,7 +38,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', + 'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23', 'info_dict': { 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', @@ -49,7 +49,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '60e8ac193d8fb71997caa8fce54c6460', + 'md5': '75ffabdb87c16d4ffe8c036dc4d1c136', 'info_dict': { 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 'ext': 'mp4', @@ -59,15 +59,15 @@ class YahooIE(InfoExtractor): } }, { - 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html', - 'md5': '3a09cf59349cfaddae1797acc3c087fc', + 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', + 'md5': '9035d38f88b1782682a3e89f985be5bb', 'info_dict': { 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', 'ext': 'mp4', 'title': '敢問市長/黃秀霜批賴清德「非常高傲」', 'description': '直言台南沒捷運 交通居五都之末', 'duration': 396, - } + }, }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', @@ -89,17 +89,32 @@ class YahooIE(InfoExtractor): 'title': 'Program that makes hockey more affordable not offered in Manitoba', 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4', 'duration': 121, - } + }, + 'skip': 'Video gone', }, { 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', - 'md5': '226a895aae7e21b0129e2a2006fe9690', 'info_dict': { - 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', - 'ext': 'mp4', - 'title': '\'The Interview\' TV Spot: War', - 'description': 'The Interview', - 'duration': 30, - } + 'id': '154609075', + }, + 'playlist': [{ + 'md5': 'f8e336c6b66f503282e5f719641d6565', + 'info_dict': { + 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', + 'ext': 'mp4', + 'title': '\'The Interview\' TV Spot: War', + 'description': 'The Interview', + 'duration': 30, + }, + }, { + 'md5': '958bcb90b4d6df71c56312137ee1cd5a', + 'info_dict': { + 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', + 'ext': 'mp4', + 'title': '\'The Interview\' TV Spot: Guys', + 'description': 'The Interview', + 'duration': 30, + }, + }], }, { 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 'md5': '88e209b417f173d86186bef6e4d1f160', @@ -119,10 +134,11 @@ class YahooIE(InfoExtractor): 'title': 'Connect the Dots: Dark Side of Virgo', 'description': 'md5:1428185051cfd1949807ad4ff6d3686a', 'duration': 201, - } + }, + 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': '989396ae73d20c6f057746fb226aa215', + 'md5': 'b17ac378b1134fa44370fb27db09a744', 'info_dict': { 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 'ext': 'mp4', @@ -141,6 +157,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', + 'timestamp': 1426270238, } }, { 'url': 'https://tw.news.yahoo.com/-100120367.html', @@ -148,7 +167,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'md5': '1ddbf7c850777548438e5c4f147c7b8c', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -166,6 +185,17 @@ class YahooIE(InfoExtractor): 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.', }, }, + { + # config['models']['applet_model']['data']['sapi'] has no query + 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016', + 'md5': 'dac0c72d502bc5facda80c9e6d5c98db', + 'info_dict': { + 'id': 'a6015640-e9e5-3efb-bb60-05589a183919', + 'ext': 'mp4', + 'description': 'Galactic', + 'title': 'Dolla Diva (feat. Maggie Koerner)', + }, + }, ] def _real_extract(self, url): @@ -174,19 +204,26 @@ class YahooIE(InfoExtractor): page_id = mobj.group('id') url = mobj.group('url') host = mobj.group('host') - webpage = self._download_webpage(url, display_id) + webpage, urlh = self._download_webpage_handle(url, display_id) + if 'err=404' in urlh.geturl(): + raise ExtractorError('Video gone', expected=True) # Look for iframed media first - iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) - if iframe_m: + entries = [] + iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) + for idx, iframe_url in enumerate(iframe_urls): iframepage = self._download_webpage( - host + iframe_m.group(1), display_id, 'Downloading iframe webpage') + host + iframe_url, display_id, + note='Downloading iframe webpage for video #%d' % idx) items_json = self._search_regex( r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) if items_json: items = json.loads(items_json) video_id = items[0]['id'] - return self._get_info(video_id, display_id, webpage) + entries.append(self._get_info(video_id, display_id, webpage)) + if entries: + return self.playlist_result(entries, page_id) + # Look for NBCSports iframes nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) if nbc_sports_url: @@ -202,7 +239,7 @@ class YahooIE(InfoExtractor): config = self._parse_json(config_json, display_id, fatal=False) if config: sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') - if sapi: + if sapi and 'query' in sapi: return self._extract_info(display_id, sapi, webpage) items_json = self._search_regex( diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index fd7eb5a6d..349ce0941 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -64,6 +64,14 @@ class YoukuIE(InfoExtractor): 'params': { 'videopassword': '100600', }, + }, { + # /play/get.json contains streams with "channel_type":"tail" + 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', + 'info_dict': { + 'id': 'XOTUxMzg4NDMy', + 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', + }, + 'playlist_count': 6, }] def construct_video_urls(self, data): @@ -92,6 +100,8 @@ class YoukuIE(InfoExtractor): fileid_dict = {} for stream in data['stream']: + if stream.get('channel_type') == 'tail': + continue format = stream.get('stream_type') fileid = stream['stream_fileid'] fileid_dict[format] = fileid @@ -117,6 +127,8 @@ class YoukuIE(InfoExtractor): # generate video_urls video_urls_dict = {} for stream in data['stream']: + if stream.get('channel_type') == 'tail': + continue format = stream.get('stream_type') video_urls = [] for dt in stream['segs']: @@ -253,6 +265,8 @@ class YoukuIE(InfoExtractor): # which one has all } for i in range(max(len(v.get('segs')) for v in data['stream']))] for stream in data['stream']: + if stream.get('channel_type') == 'tail': + continue fm = stream.get('stream_type') video_urls = video_urls_dict[fm] for video_url, seg, entry in zip(video_urls, stream['segs'], entries): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 178bfd6a6..09a27ac42 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1935,6 +1935,9 @@ def error_to_compat_str(err): def mimetype2ext(mt): + if mt is None: + return None + ext = { 'audio/mp4': 'm4a', }.get(mt) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 140a67847..8befd9607 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.19' +__version__ = '2016.04.24'