diff --git a/README.md b/README.md index 95795c315..ac1c3adad 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,7 @@ which means you can modify it, redistribute it or use it however you like. ## Verbosity / Simulation Options: -q, --quiet activates quiet mode + --no-warnings Ignore warnings -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video @@ -180,7 +181,9 @@ which means you can modify it, redistribute it or use it however you like. --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information + -j, --dump-json simulate, quiet but print JSON information. + See --output for a description of available + keys. --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 39ac8b8a1..5b6d18a82 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -143,5 +143,8 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) + def test_ComedyCentralShows(self): + self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows']) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ebd333f79..c74f1eeeb 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -396,7 +396,7 @@ def parseOpts(overrideArguments=None): help='simulate, quiet but print output format', default=False) verbosity.add_option('-j', '--dump-json', action='store_true', dest='dumpjson', - help='simulate, quiet but print JSON information', default=False) + help='simulate, quiet but print JSON information. 
See --output for a description of available keys.', default=False) verbosity.add_option('--newline', action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) verbosity.add_option('--no-progress', diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 685fc749d..8e81fa619 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -177,6 +177,8 @@ from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .ntv import NTVIE +from .oe1 import OE1IE from .ooyala import OoyalaIE from .orf import ORFIE from .parliamentliveuk import ParliamentLiveUKIE @@ -256,6 +258,7 @@ from .udemy import ( UdemyCourseIE ) from .unistra import UnistraIE +from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 922cede05..fc5d6825e 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -16,9 +16,10 @@ class AppleTrailersIE(InfoExtractor): "url": "http://trailers.apple.com/trailers/wb/manofsteel/", "playlist": [ { - "file": "manofsteel-trailer4.mov", "md5": "d97a8e575432dbcb81b7c3acb741f8a8", "info_dict": { + "id": "manofsteel-trailer4", + "ext": "mov", "duration": 111, "title": "Trailer 4", "upload_date": "20130523", @@ -26,9 +27,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer3.mov", "md5": "b8017b7131b721fb4e8d6f49e1df908c", "info_dict": { + "id": "manofsteel-trailer3", + "ext": "mov", "duration": 182, "title": "Trailer 3", "upload_date": "20130417", @@ -36,9 +38,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer.mov", "md5": "d0f1e1150989b9924679b441f3404d48", "info_dict": { + "id": "manofsteel-trailer", + "ext": "mov", "duration": 148, "title": "Trailer", "upload_date": "20121212", @@ -46,15 +49,16 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-teaser.mov", "md5": "5fe08795b943eb2e757fa95cb6def1cb", "info_dict": { + "id": "manofsteel-teaser", + "ext": "mov", "duration": 93, "title": "Teaser", "upload_date": "20120721", "uploader_id": "wb", }, - } + }, ] } @@ -65,16 +69,16 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): - s = re.sub(r'(?s).*?', u'', s) + s = re.sub(r'(?s).*?', '', s) s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') s = re.sub(self._JSON_RE, _clean_json, s) - s = u'' + s + u'' + s = '' + s + u'' return s doc = self._download_xml(playlist_url, movie, transform_source=fix_html) @@ -82,7 +86,7 @@ class AppleTrailersIE(InfoExtractor): for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] trailer_info_json = self._search_regex(self._JSON_RE, - on_click, u'trailer info') + on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) title = trailer_info['title'] video_id = movie + '-' + 
re.sub(r'[^a-zA-Z0-9]', '', title).lower() @@ -98,8 +102,7 @@ class AppleTrailersIE(InfoExtractor): first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') - settings = json.loads(settings_json) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') formats = [] for format in settings['metadata']['sizes']: @@ -107,7 +110,6 @@ class AppleTrailersIE(InfoExtractor): format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) formats.append({ 'url': format_url, - 'ext': determine_ext(format_url), 'format': format['type'], 'width': format['width'], 'height': int(format['height']), diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 7cdcd8399..49dfd881e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -28,7 +28,7 @@ class CanalplusIE(InfoExtractor): video_id = mobj.groupdict().get('id') if video_id is None: webpage = self._download_webpage(url, mobj.group('path')) - video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') + video_id = self._search_regex(r'\d+)' _TEST = { - u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', - u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', - u'info_dict': { - u'id': u'4629301', - u'ext': u'mp4', - u'title': u'Brick Briscoe', - u'duration': 612, + 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + 'md5': '4d7d549451bad625e0ff3d7bd56d776c', + 'info_dict': { + 'id': '4629301', + 'ext': 'mp4', + 'title': 'Brick Briscoe', + 'duration': 612, + 'thumbnail': 're:^https?://.+\.jpg', }, } @@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor): video_id = mobj.group('id') js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, - video_id, u'Downlaoding player') + video_id, 'Downlaoding player') # it includes a required token - flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info', + video_id, 'Downloading video info', transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 775eb8ebf..5e96eeba2 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -8,7 +8,7 @@ from ..utils import ( compat_str, compat_urllib_parse, ExtractorError, - int_or_none, + float_or_none, unified_strdate, ) @@ -46,7 +46,7 @@ class ComedyCentralShowsIE(InfoExtractor): (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) |(watch/(?P[^/]*)/(?P.*)))| (?P - extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) + extended-interviews/(?P[0-9a-z]+)/(?:playlist_tds_extended_)?(?P.*?)(/.*?)?))) $''' _TEST = { 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', @@ -134,7 +134,7 @@ class ComedyCentralShowsIE(InfoExtractor): # a URL prefix; so extract the alternate reference # and then add the URL prefix manually. 
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) + altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) if len(altMovieParams) == 0: raise ExtractorError('unable to find Flash URL in webpage ' + url) else: @@ -159,7 +159,7 @@ class ComedyCentralShowsIE(InfoExtractor): thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') - duration = int_or_none(content.attrib.get('duration')) + duration = float_or_none(content.attrib.get('duration')) mediagen_url = content.attrib['url'] guid = itemEl.find('.//guid').text.rpartition(':')[-1] diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 2bb77aec6..f8f49a013 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -1,23 +1,25 @@ +from __future__ import unicode_literals + import re from ..utils import ( compat_urllib_parse, - determine_ext ) from .common import InfoExtractor class EHowIE(InfoExtractor): - IE_NAME = u'eHow' - _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' + IE_NAME = 'eHow' + _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' _TEST = { - u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', - u'file': u'12245069.flv', - u'md5': u'9809b4e3f115ae2088440bcb4efbf371', - u'info_dict': { - u"title": u"Hardwood Flooring Basics", - u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...", - u"uploader": u"Erick Nathan" + 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', + 'md5': '9809b4e3f115ae2088440bcb4efbf371', + 'info_dict': { + 'id': '12245069', + 'ext': 'flv', + 'title': 'Hardwood Flooring Basics', + 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. 
Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...', + 'uploader': 'Erick Nathan', } } @@ -26,21 +28,16 @@ class EHowIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', - webpage, u'video URL') - final_url = compat_urllib_parse.unquote(video_url) - uploader = self._search_regex(r'', - webpage, u'uploader') + webpage, 'video URL') + final_url = compat_urllib_parse.unquote(video_url) + uploader = self._html_search_meta('uploader', webpage) title = self._og_search_title(webpage).replace(' | eHow', '') - ext = determine_ext(final_url) return { - '_type': 'video', - 'id': video_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), - 'uploader': uploader, + 'uploader': uploader, } - diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e7ee31877..fc1bedd57 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -25,6 +25,7 @@ from ..utils import ( from .brightcove import BrightcoveIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .smotri import SmotriIE class GenericIE(InfoExtractor): @@ -212,6 +213,21 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, + # smotri embed + { + 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', + 'md5': 'ec40048448e9284c9a1de77bb188108b', + 'info_dict': { + 'id': 'v27008541fad', + 'ext': 'mp4', + 'title': 'Крым и Севастополь вошли в состав России', + 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', + 'duration': 900, + 'upload_date': '20140318', + 'uploader': 'rbctv_2012_4', + 'uploader_id': 'rbctv_2012_4', + }, + }, ] def report_download_webpage(self, video_id): @@ -547,6 +563,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'ArteTVEmbed') + # Look for embedded smotri.com player + smotri_url = SmotriIE._extract_url(webpage) + if smotri_url: + return self.url_result(smotri_url, 'Smotri') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index f1875add5..7d21ea18f 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -14,7 +14,7 @@ from ..utils import ( class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://mooshare\.biz/(?P[\da-z]{12})' + _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P[\da-z]{12})' _TESTS = [ { diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 46774317c..517a72561 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -1,12 +1,10 @@ # encoding: utf-8 +from __future__ import unicode_literals import re -import socket from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_urllib_error, compat_urllib_parse, compat_urllib_request, compat_urlparse, @@ -18,57 +16,54 @@ from ..utils import ( class NiconicoIE(InfoExtractor): - IE_NAME = u'niconico' - IE_DESC = u'ニコニコ動画' + IE_NAME = 'niconico' + IE_DESC = 'ニコニコ動画' _TEST = { - u'url': 
u'http://www.nicovideo.jp/watch/sm22312215', - u'file': u'sm22312215.mp4', - u'md5': u'd1a75c0823e2f629128c43e1212760f9', - u'info_dict': { - u'title': u'Big Buck Bunny', - u'uploader': u'takuya0301', - u'uploader_id': u'2698420', - u'upload_date': u'20131123', - u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + 'url': 'http://www.nicovideo.jp/watch/sm22312215', + 'md5': 'd1a75c0823e2f629128c43e1212760f9', + 'info_dict': { + 'id': 'sm22312215', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'uploader': 'takuya0301', + 'uploader_id': '2698420', + 'upload_date': '20131123', + 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', }, - u'params': { - u'username': u'ydl.niconico@gmail.com', - u'password': u'youtube-dl', + 'params': { + 'username': 'ydl.niconico@gmail.com', + 'password': 'youtube-dl', }, } _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' _NETRC_MACHINE = 'niconico' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = True def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() - # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: - raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return False + # Login is required + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) # Log in login_form_strs = { - u'mail': username, - u'password': password, + 'mail': username, + 'password': password, } # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') request = compat_urllib_request.Request( - u'https://secure.nicovideo.jp/secure/login', login_data) + 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( - request, u'', note=u'Logging in', errnote=u'Unable to log in') + request, None, note='Logging in', errnote='Unable to log in') if re.search(r'(?i)
<h1 class="mb8p4">Log in error</h1>
', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') + self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor): video_info = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, - note=u'Downloading video info page') + note='Downloading video info page') # Get flv info flv_info_webpage = self._download_webpage( - u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, - video_id, u'Downloading flv info') + 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, 'Downloading flv info') video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information @@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor): url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id try: user_info = self._download_xml( - url, video_id, note=u'Downloading user information') + url, video_id, note='Downloading user information') video_uploader = user_info.find('.//nickname').text - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) + except ExtractorError as err: + self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) return { - 'id': video_id, - 'url': video_real_url, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, + 'id': video_id, + 'url': video_real_url, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, 'description': video_description, - 'uploader': video_uploader, + 'uploader': video_uploader, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'view_count': video_view_count, + 'view_count': video_view_count, 'webpage_url': video_webpage_url, } diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py new file mode 100644 index 000000000..e998d156e --- /dev/null +++ b/youtube_dl/extractor/ntv.py @@ -0,0 +1,158 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + RegexNotFoundError, + unescapeHTML +) + + +class NTVIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P.+)' + + _TESTS = [ + { + 'url': 'http://www.ntv.ru/novosti/863142/', + 'info_dict': { + 'id': '746000', + 'ext': 'flv', + 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'duration': 136, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/video/novosti/750370/', + 'info_dict': { + 'id': '750370', + 'ext': 'flv', + 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'duration': 172, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', + 'info_dict': { + 'id': '747480', + 'ext': 'flv', + 'title': '«Сегодня». 21 марта 2014 года. 16:00 ', + 'description': '«Сегодня». 21 марта 2014 года. 
16:00 ', + 'duration': 1496, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/kino/Koma_film', + 'info_dict': { + 'id': '750783', + 'ext': 'flv', + 'title': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ', + 'description': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ', + 'duration': 28, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', + 'info_dict': { + 'id': '751482', + 'ext': 'flv', + 'title': '«Дело врачей»: «Деревце жизни»', + 'description': '«Дело врачей»: «Деревце жизни»', + 'duration': 2590, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + ] + + _VIDEO_ID_REGEXES = [ + r'http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', + webpage) + if mobj is not None: + return mobj.group('url') + + mobj = re.search( + r'''(?x)http://smotri\.com/video/download/file/[^<]+\s* + [^<]+\s* + (?P[^<]+)''', webpage) + if mobj is not None: + return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') + def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name @@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor): # Video JSON does not provide enough meta data # We will extract some from the video web page instead - video_page_url = 'http://' + mobj.group('url') + video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page') # Warning if video is unavailable @@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor): 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'duration': video_duration, - 'view_count': video_view_count, + 'view_count': int_or_none(video_view_count), 'age_limit': 18 if adult_content else 0, 'video_page_url': video_page_url } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index ad1a46c33..a8d8e8b29 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -18,12 +18,14 @@ class TEDIE(SubtitlesInfoExtractor): (?Pplaylists(?:/\d+)?) # We have a playlist | ((?Ptalks)) # We have a simple talk + | + (?Pwatch)/[^/]+/[^/]+ ) (/lang/(.*?))? # The url may contain the language - /(?P\w+) # Here goes the name and then ".html" + /(?P[\w-]+) # Here goes the name and then ".html" .*)$ ''' - _TEST = { + _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 'md5': '4ea1dada91e4174b53dac2bb8ace429d', 'info_dict': { @@ -36,7 +38,17 @@ class TEDIE(SubtitlesInfoExtractor): 'actively fooling us.'), 'uploader': 'Dan Dennett', } - } + }, { + 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', + 'md5': '226f4fb9c62380d11b7995efa4c87994', + 'info_dict': { + 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms', + 'ext': 'mp4', + 'title': 'Vishal Sikka: The beauty and power of algorithms', + 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. 
Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.', + } + }] _FORMATS_PREFERENCE = { 'low': 1, @@ -57,6 +69,8 @@ class TEDIE(SubtitlesInfoExtractor): name = m.group('name') if m.group('type_talk'): return self._talk_info(url, name) + elif m.group('type_watch'): + return self._watch_info(url, name) else: return self._playlist_videos_info(url, name) @@ -123,3 +137,26 @@ class TEDIE(SubtitlesInfoExtractor): else: self._downloader.report_warning(u'video doesn\'t have subtitles') return {} + + def _watch_info(self, url, name): + webpage = self._download_webpage(url, name) + + config_json = self._html_search_regex( + r"data-config='([^']+)", webpage, 'config') + config = json.loads(config_json) + video_url = config['video']['url'] + thumbnail = config.get('image', {}).get('url') + + title = self._html_search_regex( + r"(?s)(.+?)", webpage, 'title') + description = self._html_search_regex( + r'(?s)
<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>
(.*?)', + webpage, 'description', fatal=False) + + return { + 'id': name, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + } diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py new file mode 100644 index 000000000..5d06fcc9e --- /dev/null +++ b/youtube_dl/extractor/urort.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + unified_strdate, +) + + +class UrortIE(InfoExtractor): + IE_DESC = 'NRK P3 Urørt' + _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P[^/]+)$' + + _TEST = { + 'url': 'https://urort.p3.no/#!/Band/Gerilja', + 'md5': '5ed31a924be8a05e47812678a86e127b', + 'info_dict': { + 'id': '33124-4', + 'ext': 'mp3', + 'title': 'The Bomb', + 'thumbnail': 're:^https?://.+\.jpg', + 'like_count': int, + 'uploader': 'Gerilja', + 'uploader_id': 'Gerilja', + 'upload_date': '20100323', + }, + 'params': { + 'matchtitle': '^The Bomb$', # To test, we want just one video + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) + json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr + songs = self._download_json(json_url, playlist_id) + print(songs[0]) + + entries = [{ + 'id': '%d-%s' % (s['BandId'], s['$id']), + 'title': s['Title'], + 'url': s['TrackUrl'], + 'ext': 'mp3', + 'uploader_id': playlist_id, + 'uploader': s.get('BandName', playlist_id), + 'like_count': s.get('LikeCount'), + 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'], + 'upload_date': unified_strdate(s.get('Released')), + } for s in songs] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_id, + 'entries': entries, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5dcc3a66..2d77aa4c3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1185,6 +1185,10 @@ def int_or_none(v, scale=1): return v if v is None else (int(v) // scale) +def float_or_none(v, scale=1): + return v if v is None else (float(v) / scale) + + def parse_duration(s): if s is None: return None diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b569d52f5..154aeca05 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.03.25.1' +__version__ = '2014.03.28'