diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d30650b7c..7a79389ce 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.05*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.05** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.06** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.05 +[debug] youtube-dl version 2016.07.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 93237022f..3388fe221 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -857,6 +857,7 @@ - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs + - **youtube:shared** - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) diff --git a/test/test_utils.py b/test/test_utils.py index a8abd9d63..651079b5a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -33,6 +33,7 @@ from youtube_dl.utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + get_element_by_class, InAdvancePagedList, intlist_to_bytes, is_html, @@ -1027,5 +1028,13 @@ The first line self.assertEqual(urshift(3, 1), 1) self.assertEqual(urshift(-3, 1), 2147483646) + def test_get_element_by_class(self): + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 8545681be..e8e40126b 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + mimetype2ext, + determine_ext, ) @@ -50,21 +52,25 @@ class AMPIE(InfoExtractor): if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: - media = media_data['@attributes'] - media_type = media['type'] - if media_type in ('video/f4m', 'application/f4m+xml'): + media = media_data.get('@attributes', {}) + media_url = media.get('url') + if not media_url: + continue + ext = mimetype2ext(media.get('type')) or determine_ext(media_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) - elif media_type == 'application/x-mpegURL': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'url': media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), + 'ext': ext, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py index 4f9320ea5..d55b26d59 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/youtube_dl/extractor/cliprs.py @@ -1,16 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, -) +from .onet import OnetBaseIE -class ClipRsIE(InfoExtractor): +class ClipRsIE(OnetBaseIE): _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' _TEST = { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') + mvp_id = self._search_mvp_id(webpage) - response = self._download_json( - 'http://qi.ckm.onetapi.pl/', video_id, - query={ - 'body[id]': video_id, - 'body[jsonrpc]': '2.0', - 'body[method]': 'get_asset_detail', - 'body[params][ID_Publikacji]': video_id, - 'body[params][Service]': 'www.onet.pl', - 'content-type': 'application/jsonp', - 'x-onet-app': 'player.front.onetapi.pl', - }) + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id - error = response.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - - video = response['result'].get('0') - - formats = [] - for _, formats_dict in video['formats'].items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_list in formats_dict.items(): - if not isinstance(format_list, list): - continue - for f in format_list: - if not f.get('url'): - continue - formats.append({ - 'url': f['url'], - 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), - 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) - self._sort_formats(formats) - - meta = video.get('meta', {}) - - title = self._og_search_title(webpage, default=None) or meta['title'] - description = self._og_search_description(webpage, default=None) or meta.get('description') - duration = meta.get('length') or meta.get('lenght') - timestamp = parse_iso8601(meta.get('addDate'), ' ') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return info_dict diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2e6226ea0..15bfc59b2 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -16,6 +16,7 @@ from ..utils import ( sanitized_Request, str_to_int, unescapeHTML, + mimetype2ext, ) @@ -153,18 +154,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - ext = determine_ext(media_url) - if type_ == 'application/x-mpegURL' or ext == 'm3u8': + ext = mimetype2ext(type_) or determine_ext(media_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, m3u8_id='hls', fatal=False)) - elif type_ == 'application/f4m' or ext == 'f4m': + elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, 'format_id': 'http-%s' % quality, + 'ext': ext, } m = re.search(r'H264-(?P\d+)x(?P\d+)', media_url) if m: diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d21d7870a..e52faa078 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -139,9 +139,9 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE +from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE @@ -581,6 +581,10 @@ from .nytimes import ( from .nuvid import NuvidIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .onet import ( + OnetIE, + OnetChannelIE, +) from .onionstudios import OnionStudiosIE from .ooyala import ( OoyalaIE, diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index b6f00cc25..e6e7659a1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -11,13 +11,14 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, + get_element_by_attribute, + mimetype2ext, ) class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P[^/]+)/(?P[^/?#]+)' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' @@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor): 'uploader': 'ign', 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', }, + 'skip': 'Page is temporarily unavailable.', }, # AnyClip video { @@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor): 'id': 'an-dVVXnuY7Jh77J', 'ext': 'mp4', 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'anyclip', - 'description': 'md5:38c711dd98f5bb87acf973d573442e67', + 'uploader': 'AnyClip', + 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', }, }, # age-restricted video @@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor): def report_disclaimer(self): self.to_screen('Retrieving disclaimer') - def _real_initialize(self): + def _confirm_age(self): # Retrieve disclaimer self.report_disclaimer() self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer') # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() - self._download_webpage(request, None, False, 'Unable to confirm age') + self._download_webpage( + self._FILTER_POST, None, False, 'Unable to confirm age', + data=urlencode_postdata({ + 'filters': '0', + 'submit': "Continue - I'm over 18", + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) def _real_extract(self, url): # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - - video_id = mobj.group(1) + video_id, display_id = re.match(self._VALID_URL, url).groups() # the video may come from an external site m_external = re.match('^(\w{2})-(.*)$', video_id) @@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor): if prefix == 'cb': return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - # Retrieve video webpage to extract further information - req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id) + # self._confirm_age() # AnyClip videos require the flashversion cookie so that we get the link # to the mp4 file - mobj_an = re.match(r'^an-(.*?)$', video_id) - if mobj_an: - req.headers['Cookie'] = 'flashVersion=0;' - webpage = self._download_webpage(req, video_id) + headers = {} + if video_id.startswith('an-'): + headers['Cookie'] = 'flashVersion=0;' + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id, headers=headers) + + error = get_element_by_attribute( + 'class', 'notfound-page-title', webpage) + if error: + raise ExtractorError(error, expected=True) + + video_title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'

(.*?)

', webpage, 'title') # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor): 'player_url': player_url, 'ext': play_path.partition(':')[0], }) + if video_url is None: + flashvars = self._parse_json(self._search_regex( + r'flashvars\s*=\s*({.*});', webpage, 'flashvars', + default=None), video_id, fatal=False) + if flashvars: + video_url = [] + for source in flashvars.get('sources'): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + if ext == 'm3u8': + video_url.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + video_url.append({ + 'url': source_url, + 'ext': ext, + }) if video_url is None: raise ExtractorError('Unsupported video type') - video_title = self._html_search_regex( - r'(?im)(.*) - Video', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], + webpage, 'title', fatal=False) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'title', fatal=False) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) duration = int_or_none( - self._html_search_meta('video:duration', webpage)) - + self._html_search_meta('video:duration', webpage, default=None)) age_limit = ( 18 if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) @@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor): 'url': video_url, 'ext': video_ext, }] - self._sort_formats(formats) + return { 'id': video_id, + 'display_id': display_id, 'description': description, 'uploader': video_uploader, 'title': video_title, diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py new file mode 100644 index 000000000..402d3a9f7 --- /dev/null +++ b/youtube_dl/extractor/onet.py @@ -0,0 +1,172 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + parse_iso8601, + remove_start, + strip_or_none, + url_basename, +) + + +class OnetBaseIE(InfoExtractor): + def _search_mvp_id(self, webpage): + return self._search_regex( + r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') + + def _extract_from_id(self, video_id, webpage): + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + video_url = f.get('url') + if not video_url: + continue + ext = determine_ext(video_url) + if format_id == 'ism': + # TODO: Support Microsoft Smooth Streaming + continue + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # not implemented yet + # formats.extend(self._extract_mpd_formats( + # video_url, video_id, mpd_id='dash', fatal=False)) + continue + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } + + +class OnetIE(OnetBaseIE): + _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + IE_NAME = 'onet.tv' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'md5': 'e3ffbf47590032ac3f27249204173d50', + 'info_dict': { + 'id': 'qbpyqc', + 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', + 'ext': 'mp4', + 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', + 'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', + 'upload_date': '20160705', + 'timestamp': 1467721580, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + + return info_dict + + +class OnetChannelIE(OnetBaseIE): + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P[a-z]+)(?:[?#]|$)' + IE_NAME = 'onet.tv:channel' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival', + 'info_dict': { + 'id': 'openerfestival', + 'title': 'Open\'er Festival Live', + 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.', + }, + 'playlist_mincount': 46, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + webpage = self._download_webpage(url, channel_id) + + current_clip_info = self._parse_json(self._search_regex( + r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, + transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) + video_id = remove_start(current_clip_info['ckmId'], 'mvp:') + video_name = url_basename(current_clip_info['url']) + + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_name) + return self._extract_from_id(video_id, webpage) + + self.to_screen( + 'Downloading channel %s - add --no-playlist to just download video %s' % ( + channel_id, video_name)) + matches = re.findall( + r']+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', + webpage) + entries = [ + self.url_result(video_link, OnetIE.ie_key()) + for video_link in matches] + + channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) + channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) + return self.playlist_result(entries, channel_id, channel_title, channel_description) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index d7b13a0f1..6fb1a3fcc 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + float_or_none, + mimetype2ext, ) @@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': 'd4851405d31adfadf71cd7a487b765bb', + 'md5': 'e49f947c105b8a78a675a0ee1bddedfe', 'info_dict': { 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'description': 'md5:e786add7f280b7f0fe237b64cc73df76', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'The A.V. Club', - 'uploader_id': 'TheAVClub', + 'uploader_id': 'the-av-club', }, }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) + video_data = self._download_json( + 'http://www.onionstudios.com/video/%s.json' % video_id, video_id) + + title = video_data['title'] formats = [] - for src in re.findall(r']+src="([^"]+)"', webpage): - ext = determine_ext(src) + for source in video_data.get('sources', []): + source_url = source.get('url') + if not source_url: + continue + ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: - height = int_or_none(self._search_regex( - r'/(\d+)\.%s' % ext, src, 'height', default=None)) + tbr = int_or_none(source.get('bitrate')) formats.append({ - 'format_id': ext + ('-%sp' % height if height else ''), - 'url': src, - 'height': height, + 'format_id': ext + ('-%d' % tbr if tbr else ''), + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'tbr': tbr, 'ext': ext, - 'preference': 1, }) self._sort_formats(formats) - title = self._search_regex( - r'share_title\s*=\s*(["\'])(?P[^\1]+?)\1', - webpage, 'title', group='title') - description = self._search_regex( - r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1', - webpage, 'description', default=None, group='description') - thumbnail = self._search_regex( - r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', - webpage, 'thumbnail', default=False, group='thumbnail') - - uploader_id = self._search_regex( - r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1', - webpage, 'uploader id', fatal=False, group='uploader_id') - uploader = self._search_regex( - r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1', - webpage, 'uploader', default=False, group='uploader') - return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'thumbnail': video_data.get('poster_url'), + 'uploader': video_data.get('channel_name'), + 'uploader_id': video_data.get('channel_slug'), + 'duration': float_or_none(video_data.get('duration', 1000)), + 'tags': video_data.get('tags'), 'formats': formats, } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 07d49d489..c6eee3b72 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,7 +5,7 @@ import re from hashlib import sha1 from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', @@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', @@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', @@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', @@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', @@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor): ] def _extract_clip(self, url, webpage): - clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'prosieben' client_name = 'kolibri-2.0.19-splec4' client_location = url - videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_location': client_location, - 'client_name': client_name, - 'ids': clip_id, - }) - - video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': access_token, + 'client_location': client_location, + 'client_name': client_name, + 'ids': clip_id, + })[0] if video.get('is_protected') is True: raise ExtractorError('This video is DRM protected.', expected=True) duration = float_or_none(video.get('duration')) - source_ids = [source['id'] for source in video['sources']] - source_ids_str = ','.join(map(str, source_ids)) + source_ids = [compat_str(source['id']) for source in video['sources']] g = '01!8d8F_)r9]4s[qeuXfP%' + client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() - client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) - .encode('utf-8')).hexdigest() - - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - })) - - sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + }) server_id = sources['server_id'] - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, - client_location, source_ids_str, g, client_name]) - .encode('utf-8')).hexdigest() - - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_ids_str, - })) - - urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - - formats = [] - - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() def fix_bitrate(bitrate): bitrate = int_or_none(bitrate) @@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor): return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - for source in urls_sources: - protocol = source['protocol'] - source_url = source['url'] - if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: + formats = [] + for source_id in source_ids: + client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'vbr': fix_bitrate(source['bitrate']), - 'ext': 'mp4', - 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), - }) - elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats(source_url, clip_id)) - else: - formats.append({ - 'url': source_url, - 'vbr': fix_bitrate(source['bitrate']), - }) - + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) self._sort_formats(formats) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._html_search_regex( + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) + return { 'id': clip_id, 'title': title, diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 4896d09d6..f6454c6b0 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', 'info_dict': { 'id': '131946', 'ext': 'mp4', @@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor): raise ExtractorError( '%s returned error code %d' % (self.IE_NAME, status), expected=True) - formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) - for item in meta['source']['fb']: - if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats( - item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) - elif item.get('type') == '': - formats.append({'url': item['file']}) + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a49..96e43af84 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -1,18 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json -import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, - js_to_json, mimetype2ext, - sanitized_Request, - unified_strdate, ) @@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', + 'upload_date': '20120409', + 'timestamp': 1333983600, 'duration': 7794, } } @@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) + presentation_data = self._download_json( + 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', + video_id, data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': video_id, + 'QueryString': '', + } + }), headers={ + 'Content-Type': 'application/json; charset=utf-8', + })['d']['Presentation'] - js_path = self._search_regex( - r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"', - webpage, 'JS code URL') - js_url = compat_urlparse.urljoin(url, js_path) - - js_code = self._download_webpage( - js_url, video_id, note='Downloading player') - - def extract_str(key, **args): - return self._search_regex( - r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key), - js_code, key, **args) - - def extract_data(key, **args): - data_json = extract_str(key, **args) - if data_json is None: - return data_json - return self._parse_json( - data_json, video_id, transform_source=js_to_json) + title = presentation_data['Title'] formats = [] - for i in itertools.count(): - fd = extract_data('VideoUrls[%d]' % i, default=None) - if fd is None: - break - formats.append({ - 'format_id': '%s' % i, - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) + for stream in presentation_data.get('Streams', []): + for fd in stream.get('VideoUrls', []): + formats.append({ + 'format_id': fd['MediaType'], + 'format_note': fd['MimeType'].partition('/')[2], + 'ext': mimetype2ext(fd['MimeType']), + 'url': fd['Location'], + 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, + }) self._sort_formats(formats) - slide_baseurl = compat_urlparse.urljoin( - url, extract_data('SlideBaseUrl')) - slide_template = slide_baseurl + re.sub( - r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate')) - slides = [] - last_slide_time = 0 - for i in itertools.count(1): - sd = extract_str('Slides[%d]' % i, default=None) - if sd is None: - break - timestamp = int_or_none(self._search_regex( - r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),', - sd, 'slide %s timestamp' % i, fatal=False)) - slides.append({ - 'url': slide_template % i, - 'duration': timestamp - last_slide_time, - }) - last_slide_time = timestamp - formats.append({ - 'format_id': 'slides', - 'protocol': 'slideshow', - 'url': json.dumps(slides), - 'preference': -10000, # Downloader not yet written - }) - self._sort_formats(formats) - - title = extract_data('Title') - description = extract_data('Description', fatal=False) - duration = int_or_none(extract_data( - 'Duration', fatal=False), scale=1000) - upload_date = unified_strdate(extract_data('AirDate', fatal=False)) - return { 'id': video_id, 'title': title, - 'description': description, + 'description': presentation_data.get('Description'), 'formats': formats, - 'upload_date': upload_date, - 'duration': duration, + 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), + 'duration': int_or_none(presentation_data.get('Duration'), 1000), } diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 759a332d2..d3aba58a2 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( qualities, int_or_none, + mimetype2ext, + determine_ext, ) @@ -34,19 +36,21 @@ class SixPlayIE(InfoExtractor): source_type, source_url = source.get('type'), source.get('src') if not source_url or source_type == 'hls/primetime': continue - if source_type == 'application/vnd.apple.mpegURL': + ext = mimetype2ext(source_type) or determine_ext(source_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( source_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) - elif source_type == 'video/mp4': + elif ext == 'mp4': quality = source.get('quality') formats.append({ 'url': source_url, 'format_id': quality, 'quality': quality_key(quality), + 'ext': ext, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 0b717a1e4..4967c1b77 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -9,6 +9,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + get_element_by_id, ) @@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor): bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) - description = self._html_search_regex( + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, 'description', fatal=False) @@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description, + 'description': description.strip() if description else None, } diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 39a7aaf9d..3c552807e 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,8 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + extract_attributes, + unified_strdate, + get_element_by_attribute, +) class SpiegelIE(InfoExtractor): @@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, + 'upload_date': '20130311', }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', @@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, + 'upload_date': '20131115', }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', @@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor): 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - title = re.sub(r'\s+', ' ', self._html_search_regex( - r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', - webpage, 'title')) - description = self._html_search_meta('description', webpage, 'description') + video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) + + title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) + description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], @@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, + 'description': description.strip() if description else None, 'duration': duration, + 'upload_date': unified_strdate(video_data.get('data-video-date')), 'formats': formats, } @@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', 'description': 're:^Patrick Kämnitz gehört.{100,}', + 'upload_date': '20140825', }, }, { 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index d5c852f52..0f8782d03 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor): episode = self._parse_json( js_to_json(self._search_regex( - r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), display_id)['config']['episode'] title = unescapeHTML(episode['title']) diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py index c77a07989..a0bc12c81 100644 --- a/youtube_dl/extractor/threeqsdn.py +++ b/youtube_dl/extractor/threeqsdn.py @@ -92,12 +92,11 @@ class ThreeQSDNIE(InfoExtractor): if not item_url or item_url in urls: return urls.add(item_url) - type_ = item.get('type') - ext = determine_ext(item_url, default_ext=None) - if type_ == 'application/dash+xml' or ext == 'mpd': + ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) + if ext == 'mpd': formats.extend(self._extract_mpd_formats( item_url, video_id, mpd_id='mpd', fatal=False)) - elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( item_url, video_id, 'mp4', entry_protocol='m3u8' if live else 'm3u8_native', @@ -111,7 +110,7 @@ class ThreeQSDNIE(InfoExtractor): formats.append({ 'url': item_url, 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, + 'ext': 'mp4' if item_url.startswith('rtsp') else ext, 'vcodec': 'none' if stream_type == 'audio' else None, }) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 554debd43..69603c1f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1995,9 +1995,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: - channel_playlist_id = self._search_regex( - r'data-(?:channel-external-|yt)id="([^"]+)"', - channel_page, 'channel id', default=None) + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -2020,6 +2024,15 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) @@ -2033,7 +2046,8 @@ class YoutubeUserIE(YoutubeChannelIE): 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 'title': 'Uploads from The Linux Foundation', } }, { 'url': 'ytuser:phihag', @@ -2041,6 +2055,10 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + # This channel is not available. + 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73a58d420..326aa810d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -310,9 +310,17 @@ def get_element_by_id(id, html): return get_element_by_attribute('id', id, html) -def get_element_by_attribute(attribute, value, html): +def get_element_by_class(class_name, html): + return get_element_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_element_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" + value = re.escape(value) if escape_value else value + m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? @@ -321,7 +329,7 @@ def get_element_by_attribute(attribute, value, html): \s*> (?P<content>.*?) </\1> - ''' % (re.escape(attribute), re.escape(value)), html) + ''' % (re.escape(attribute), value), html) if not m: return None @@ -2097,6 +2105,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') + res = res.lower() return { '3gpp': '3gp', @@ -2108,6 +2117,12 @@ def mimetype2ext(mt): 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 'wmv', + 'mpegurl': 'm3u8', + 'x-mpegurl': 'm3u8', + 'vnd.apple.mpegurl': 'm3u8', + 'dash+xml': 'mpd', + 'f4m': 'f4m', + 'f4m+xml': 'f4m', }.get(res, res) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cd106dd7d..d1974d089 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.05' +__version__ = '2016.07.06'