From 24a2aac445d9149c5e4154e6fb32150a25646987 Mon Sep 17 00:00:00 2001 From: Mark Lee Date: Tue, 25 Mar 2014 22:06:49 -0700 Subject: [PATCH 01/21] [comedycentral] fix TDS extended interviews The new website broke the URL format. Added "playlist" as a valid ID keyword. --- youtube_dl/extractor/comedycentral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 483ae5761..ea1675cf6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -46,7 +46,7 @@ class ComedyCentralShowsIE(InfoExtractor): (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) |(watch/(?P[^/]*)/(?P.*)))| (?P - extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) + extended-interviews/(?P[0-9a-z]+)/(?:playlist_tds_extended_)?(?P.*?)(/.*?)?))) $''' _TEST = { 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', @@ -134,7 +134,7 @@ class ComedyCentralShowsIE(InfoExtractor): # a URL prefix; so extract the alternate reference # and then add the URL prefix manually. 
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) + altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) if len(altMovieParams) == 0: raise ExtractorError('unable to find Flash URL in webpage ' + url) else: From 69c01a9f684f44b601840a93496278e06f31b928 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:02:48 +0100 Subject: [PATCH 02/21] [comedycentral] Add a testcase for extended-interviews URLs (#2636) --- test/test_all_urls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 39ac8b8a1..5b6d18a82 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -143,5 +143,8 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) + def test_ComedyCentralShows(self): + self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows']) + if __name__ == '__main__': unittest.main() From ac6c104871399dd6832f70069a14e4c404d86f38 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:22:40 +0100 Subject: [PATCH 03/21] [ted] Add support for watch/ URLs (Fixes #2637) --- youtube_dl/extractor/ted.py | 43 ++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index ad1a46c33..a8d8e8b29 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -18,12 +18,14 @@ class TEDIE(SubtitlesInfoExtractor): (?Pplaylists(?:/\d+)?) # We have a playlist | ((?Ptalks)) # We have a simple talk + | + (?Pwatch)/[^/]+/[^/]+ ) (/lang/(.*?))? 
# The url may contain the language - /(?P\w+) # Here goes the name and then ".html" + /(?P[\w-]+) # Here goes the name and then ".html" .*)$ ''' - _TEST = { + _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', 'md5': '4ea1dada91e4174b53dac2bb8ace429d', 'info_dict': { @@ -36,7 +38,17 @@ class TEDIE(SubtitlesInfoExtractor): 'actively fooling us.'), 'uploader': 'Dan Dennett', } - } + }, { + 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms', + 'md5': '226f4fb9c62380d11b7995efa4c87994', + 'info_dict': { + 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms', + 'ext': 'mp4', + 'title': 'Vishal Sikka: The beauty and power of algorithms', + 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.', + } + }] _FORMATS_PREFERENCE = { 'low': 1, @@ -57,6 +69,8 @@ class TEDIE(SubtitlesInfoExtractor): name = m.group('name') if m.group('type_talk'): return self._talk_info(url, name) + elif m.group('type_watch'): + return self._watch_info(url, name) else: return self._playlist_videos_info(url, name) @@ -123,3 +137,26 @@ class TEDIE(SubtitlesInfoExtractor): else: self._downloader.report_warning(u'video doesn\'t have subtitles') return {} + + def _watch_info(self, url, name): + webpage = self._download_webpage(url, name) + + config_json = self._html_search_regex( + r"data-config='([^']+)", webpage, 'config') + config = json.loads(config_json) + video_url = config['video']['url'] + thumbnail = config.get('image', {}).get('url') + + title = self._html_search_regex( + r"(?s)(.+?)", webpage, 'title') + description = self._html_search_regex( + r'(?s)

.*?

(.*?)', + webpage, 'description', fatal=False) + + return { + 'id': name, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'description': description, + } From 6c072e7d2521821f692102fdfa4a431cf1a076ae Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:22:57 +0100 Subject: [PATCH 04/21] release 2014.03.27 --- README.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 95795c315..0352a5bcb 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,7 @@ which means you can modify it, redistribute it or use it however you like. ## Verbosity / Simulation Options: -q, --quiet activates quiet mode + --no-warnings Ignore warnings -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b569d52f5..4b468bb9e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.03.25.1' +__version__ = '2014.03.27' From e9c076c31753d97de574d43c19feef069571bd3a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:30:00 +0100 Subject: [PATCH 05/21] [clipsyndicate] Modernize --- youtube_dl/extractor/clipsyndicate.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 9ab6a4ab6..02a1667fa 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor): _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TEST = { - u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', - u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', - u'info_dict': { - u'id': 
u'4629301', - u'ext': u'mp4', - u'title': u'Brick Briscoe', - u'duration': 612, + 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + 'md5': '4d7d549451bad625e0ff3d7bd56d776c', + 'info_dict': { + 'id': '4629301', + 'ext': 'mp4', + 'title': 'Brick Briscoe', + 'duration': 612, + 'thumbnail': 're:^https?://.+\.jpg', }, } @@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor): video_id = mobj.group('id') js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, - video_id, u'Downlaoding player') + video_id, 'Downlaoding player') # it includes a required token - flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info', + video_id, 'Downloading video info', transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') From e2b06e76c12bb97775a36fdd0d3c58066160a24f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:51:50 +0100 Subject: [PATCH 06/21] [urort] Add extractor (Fixes #2634) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/urort.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/urort.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 685fc749d..481296231 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -256,6 +256,7 @@ from .udemy import ( UdemyCourseIE ) from .unistra import UnistraIE +from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py new file mode 100644 index 000000000..ad4807491 --- /dev/null +++ b/youtube_dl/extractor/urort.py @@ 
-0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, +) + + +class UrortIE(InfoExtractor): + IE_DESC = 'NRK P3 Urørt' + _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P[^/]+)$' + + _TEST = { + 'url': 'https://urort.p3.no/#!/Band/Gerilja', + 'md5': '5ed31a924be8a05e47812678a86e127b', + 'info_dict': { + 'id': '33124-4', + 'ext': 'mp3', + 'title': 'The Bomb', + 'thumbnail': 're:^https?://.+\.jpg', + 'like_count': int, + 'uploader': 'Gerilja', + 'uploader_id': 'Gerilja', + }, + 'params': { + 'matchtitle': '^The Bomb$', # To test, we want just one video + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) + json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr + songs = self._download_json(json_url, playlist_id) + + entries = [{ + 'id': '%d-%s' % (s['BandId'], s['$id']), + 'title': s['Title'], + 'url': s['TrackUrl'], + 'ext': 'mp3', + 'uploader_id': playlist_id, + 'uploader': s.get('BandName', playlist_id), + 'like_count': s.get('LikeCount'), + 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'], + } for s in songs] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_id, + 'entries': entries, + } From 4c89bbd22cf24b3c275fef759fbdea3b06657c05 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 27 Mar 2014 02:52:06 +0100 Subject: [PATCH 07/21] release 2014.03.27.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4b468bb9e..5a415d489 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.03.27' +__version__ = '2014.03.27.1' From 0dae5083f13682da7c43ac2d8215d1147c3332b4 Mon Sep 17 00:00:00 2001 From: Philipp 
Hagemeister Date: Thu, 27 Mar 2014 02:56:23 +0100 Subject: [PATCH 08/21] [urort] Add date --- youtube_dl/extractor/urort.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py index ad4807491..5d06fcc9e 100644 --- a/youtube_dl/extractor/urort.py +++ b/youtube_dl/extractor/urort.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + unified_strdate, ) @@ -24,6 +25,7 @@ class UrortIE(InfoExtractor): 'like_count': int, 'uploader': 'Gerilja', 'uploader_id': 'Gerilja', + 'upload_date': '20100323', }, 'params': { 'matchtitle': '^The Bomb$', # To test, we want just one video @@ -37,6 +39,7 @@ class UrortIE(InfoExtractor): fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr songs = self._download_json(json_url, playlist_id) + print(songs[0]) entries = [{ 'id': '%d-%s' % (s['BandId'], s['$id']), @@ -47,6 +50,7 @@ class UrortIE(InfoExtractor): 'uploader': s.get('BandName', playlist_id), 'like_count': s.get('LikeCount'), 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'], + 'upload_date': unified_strdate(s.get('Released')), } for s in songs] return { From 8cdafb47b946bc92f50e22fc664130ad18498e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 27 Mar 2014 19:08:02 +0700 Subject: [PATCH 09/21] [mooshare] Add support for URLs starting with 'www' --- youtube_dl/extractor/mooshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index f1875add5..7d21ea18f 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -14,7 +14,7 @@ from ..utils import ( class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://mooshare\.biz/(?P[\da-z]{12})' + _VALID_URL = 
r'http://(?:www\.)?mooshare\.biz/(?P[\da-z]{12})' _TESTS = [ { From 214c22c704e3a9be43ae36871705106b4eed6e52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 27 Mar 2014 21:01:09 +0100 Subject: [PATCH 10/21] [niconico] Modernize --- youtube_dl/extractor/niconico.py | 81 +++++++++++++++----------------- 1 file changed, 38 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 46774317c..517a72561 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -1,12 +1,10 @@ # encoding: utf-8 +from __future__ import unicode_literals import re -import socket from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_urllib_error, compat_urllib_parse, compat_urllib_request, compat_urlparse, @@ -18,57 +16,54 @@ from ..utils import ( class NiconicoIE(InfoExtractor): - IE_NAME = u'niconico' - IE_DESC = u'ニコニコ動画' + IE_NAME = 'niconico' + IE_DESC = 'ニコニコ動画' _TEST = { - u'url': u'http://www.nicovideo.jp/watch/sm22312215', - u'file': u'sm22312215.mp4', - u'md5': u'd1a75c0823e2f629128c43e1212760f9', - u'info_dict': { - u'title': u'Big Buck Bunny', - u'uploader': u'takuya0301', - u'uploader_id': u'2698420', - u'upload_date': u'20131123', - u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + 'url': 'http://www.nicovideo.jp/watch/sm22312215', + 'md5': 'd1a75c0823e2f629128c43e1212760f9', + 'info_dict': { + 'id': 'sm22312215', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'uploader': 'takuya0301', + 'uploader_id': '2698420', + 'upload_date': '20131123', + 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', }, - u'params': { - u'username': u'ydl.niconico@gmail.com', - u'password': u'youtube-dl', + 'params': { + 'username': 'ydl.niconico@gmail.com', + 'password': 'youtube-dl', }, } _VALID_URL = 
r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' _NETRC_MACHINE = 'niconico' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = True def _real_initialize(self): self._login() def _login(self): (username, password) = self._get_login_info() - # No authentication to be performed if username is None: - if self._LOGIN_REQUIRED: - raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return False + # Login is required + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) # Log in login_form_strs = { - u'mail': username, - u'password': password, + 'mail': username, + 'password': password, } # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') request = compat_urllib_request.Request( - u'https://secure.nicovideo.jp/secure/login', login_data) + 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( - request, u'', note=u'Logging in', errnote=u'Unable to log in') + request, None, note='Logging in', errnote='Unable to log in') if re.search(r'(?i)

Log in error

', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') + self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor): video_info = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, - note=u'Downloading video info page') + note='Downloading video info page') # Get flv info flv_info_webpage = self._download_webpage( - u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, - video_id, u'Downloading flv info') + 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, 'Downloading flv info') video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information @@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor): url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id try: user_info = self._download_xml( - url, video_id, note=u'Downloading user information') + url, video_id, note='Downloading user information') video_uploader = user_info.find('.//nickname').text - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) + except ExtractorError as err: + self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err)) return { - 'id': video_id, - 'url': video_real_url, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, + 'id': video_id, + 'url': video_real_url, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, 'description': video_description, - 'uploader': video_uploader, + 'uploader': video_uploader, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, - 'view_count': video_view_count, + 'view_count': video_view_count, 'webpage_url': 
video_webpage_url, } From 28acf5500a05f01111f65adc84ba7c8e78fcd10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 27 Mar 2014 21:10:51 +0100 Subject: [PATCH 11/21] [appletrailers] Modernize --- youtube_dl/extractor/appletrailers.py | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 922cede05..fc5d6825e 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -16,9 +16,10 @@ class AppleTrailersIE(InfoExtractor): "url": "http://trailers.apple.com/trailers/wb/manofsteel/", "playlist": [ { - "file": "manofsteel-trailer4.mov", "md5": "d97a8e575432dbcb81b7c3acb741f8a8", "info_dict": { + "id": "manofsteel-trailer4", + "ext": "mov", "duration": 111, "title": "Trailer 4", "upload_date": "20130523", @@ -26,9 +27,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer3.mov", "md5": "b8017b7131b721fb4e8d6f49e1df908c", "info_dict": { + "id": "manofsteel-trailer3", + "ext": "mov", "duration": 182, "title": "Trailer 3", "upload_date": "20130417", @@ -36,9 +38,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer.mov", "md5": "d0f1e1150989b9924679b441f3404d48", "info_dict": { + "id": "manofsteel-trailer", + "ext": "mov", "duration": 148, "title": "Trailer", "upload_date": "20121212", @@ -46,15 +49,16 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-teaser.mov", "md5": "5fe08795b943eb2e757fa95cb6def1cb", "info_dict": { + "id": "manofsteel-teaser", + "ext": "mov", "duration": 93, "title": "Teaser", "upload_date": "20120721", "uploader_id": "wb", }, - } + }, ] } @@ -65,16 +69,16 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') + playlist_url = 
compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): - s = re.sub(r'(?s).*?', u'', s) + s = re.sub(r'(?s).*?', '', s) s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') s = re.sub(self._JSON_RE, _clean_json, s) - s = u'' + s + u'' + s = '' + s + u'' return s doc = self._download_xml(playlist_url, movie, transform_source=fix_html) @@ -82,7 +86,7 @@ class AppleTrailersIE(InfoExtractor): for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] trailer_info_json = self._search_regex(self._JSON_RE, - on_click, u'trailer info') + on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() @@ -98,8 +102,7 @@ class AppleTrailersIE(InfoExtractor): first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') - settings = json.loads(settings_json) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') formats = [] for format in settings['metadata']['sizes']: @@ -107,7 +110,6 @@ class AppleTrailersIE(InfoExtractor): format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) formats.append({ 'url': format_url, - 'ext': determine_ext(format_url), 'format': format['type'], 'width': format['width'], 'height': int(format['height']), From d26ebe990fd0e12365202db9948702be7c5edd02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 27 Mar 2014 21:23:02 +0100 Subject: [PATCH 
12/21] [ehow] Modernize --- youtube_dl/extractor/ehow.py | 43 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 2bb77aec6..f8f49a013 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -1,23 +1,25 @@ +from __future__ import unicode_literals + import re from ..utils import ( compat_urllib_parse, - determine_ext ) from .common import InfoExtractor class EHowIE(InfoExtractor): - IE_NAME = u'eHow' - _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' + IE_NAME = 'eHow' + _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' _TEST = { - u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', - u'file': u'12245069.flv', - u'md5': u'9809b4e3f115ae2088440bcb4efbf371', - u'info_dict': { - u"title": u"Hardwood Flooring Basics", - u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...", - u"uploader": u"Erick Nathan" + 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', + 'md5': '9809b4e3f115ae2088440bcb4efbf371', + 'info_dict': { + 'id': '12245069', + 'ext': 'flv', + 'title': 'Hardwood Flooring Basics', + 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. 
Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...', + 'uploader': 'Erick Nathan', } } @@ -26,21 +28,16 @@ class EHowIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', - webpage, u'video URL') - final_url = compat_urllib_parse.unquote(video_url) - uploader = self._search_regex(r'', - webpage, u'uploader') + webpage, 'video URL') + final_url = compat_urllib_parse.unquote(video_url) + uploader = self._html_search_meta('uploader', webpage) title = self._og_search_title(webpage).replace(' | eHow', '') - ext = determine_ext(final_url) return { - '_type': 'video', - 'id': video_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), - 'uploader': uploader, + 'uploader': uploader, } - From 8efd15f4772181f0a72adf369ba71fa219594af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 28 Mar 2014 18:47:15 +0700 Subject: [PATCH 13/21] [canalplus] Fix video id extraction (Closes #2645) --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 7cdcd8399..49dfd881e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -28,7 +28,7 @@ class CanalplusIE(InfoExtractor): video_id = mobj.groupdict().get('id') if video_id is None: webpage = self._download_webpage(url, mobj.group('path')) - video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') + video_id = self._search_regex(r' Date: Fri, 28 Mar 2014 19:58:49 +0700 Subject: [PATCH 14/21] [smotri] Modernize and add support for emdebbed videos (Closes #2585) --- 
youtube_dl/extractor/generic.py | 21 ++++++++++++++ youtube_dl/extractor/smotri.py | 51 +++++++++++++++++++++++++++------ 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e7ee31877..fc1bedd57 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -25,6 +25,7 @@ from ..utils import ( from .brightcove import BrightcoveIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .smotri import SmotriIE class GenericIE(InfoExtractor): @@ -212,6 +213,21 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, + # smotri embed + { + 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', + 'md5': 'ec40048448e9284c9a1de77bb188108b', + 'info_dict': { + 'id': 'v27008541fad', + 'ext': 'mp4', + 'title': 'Крым и Севастополь вошли в состав России', + 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', + 'duration': 900, + 'upload_date': '20140318', + 'uploader': 'rbctv_2012_4', + 'uploader_id': 'rbctv_2012_4', + }, + }, ] def report_download_webpage(self, video_id): @@ -547,6 +563,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'ArteTVEmbed') + # Look for embedded smotri.com player + smotri_url = SmotriIE._extract_url(webpage) + if smotri_url: + return self.url_result(smotri_url, 'Smotri') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 540c55703..13e7e71cb 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -13,22 +13,24 @@ from ..utils import ( compat_urllib_request, ExtractorError, url_basename, + int_or_none, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = 
r'^https?://(?:www\.)?(?Psmotri\.com/video/view/\?id=(?Pv(?P[0-9]+)[a-z0-9]{4}))' + _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?Pv(?P[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'file': 'v261036632ab.mp4', 'md5': '2a7b08249e6f5636557579c368040eb9', 'info_dict': { + 'id': 'v261036632ab', + 'ext': 'mp4', 'title': 'катастрофа с камер видеонаблюдения', 'uploader': 'rbc2008', 'uploader_id': 'rbc08', @@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor): # real video id 57591 { 'url': 'http://smotri.com/video/view/?id=v57591cb20', - 'file': 'v57591cb20.flv', 'md5': '830266dfc21f077eac5afd1883091bcd', 'info_dict': { + 'id': 'v57591cb20', + 'ext': 'flv', 'title': 'test', 'uploader': 'Support Photofile@photofile', 'uploader_id': 'support-photofile', @@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor): # video-password { 'url': 'http://smotri.com/video/view/?id=v1390466a13c', - 'file': 'v1390466a13c.mp4', 'md5': 'f6331cef33cad65a0815ee482a54440b', 'info_dict': { + 'id': 'v1390466a13c', + 'ext': 'mp4', 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', 'uploader': 'timoxa40', 'uploader_id': 'timoxa40', @@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor): # age limit + video-password { 'url': 'http://smotri.com/video/view/?id=v15408898bcf', - 'file': 'v15408898bcf.flv', 'md5': '91e909c9f0521adf5ee86fbe073aad70', 'info_dict': { + 'id': 'v15408898bcf', + 'ext': 'flv', 'title': 'этот ролик не покажут по ТВ', 'uploader': 'zzxxx', 'uploader_id': 'ueggb', @@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor): 'params': { 'videopassword': '333' } - } + }, + # swf player + { + 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', + 'md5': '4d47034979d9390d14acdf59c4935bc2', + 'info_dict': { + 'id': 'v9188090500', + 'ext': 'mp4', + 'title': 'Shakira - Don\'t Bother', + 'uploader': 'HannahL', + 
'uploader_id': 'lisaha95', + 'upload_date': '20090331', + 'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother', + 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', + }, + }, ] _SUCCESS = 0 @@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor): _PASSWORD_DETECTED = 2 _VIDEO_NOT_FOUND = 3 + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']src=(["\'])(?Phttp://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', + webpage) + if mobj is not None: + return mobj.group('url') + + mobj = re.search( + r'''(?x)http://smotri\.com/video/download/file/[^<]+\s* + [^<]+\s* + (?P[^<]+)''', webpage) + if mobj is not None: + return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') + def _search_meta(self, name, html, display_name=None): if display_name is None: display_name = name @@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor): # Video JSON does not provide enough meta data # We will extract some from the video web page instead - video_page_url = 'http://' + mobj.group('url') + video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page') # Warning if video is unavailable @@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor): 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'duration': video_duration, - 'view_count': video_view_count, + 'view_count': int_or_none(video_view_count), 'age_limit': 18 if adult_content else 0, 'video_page_url': video_page_url } From f0da3f1ef925ce8aa0850277a03d510a29c0f43d Mon Sep 17 00:00:00 2001 From: phaer Date: Fri, 28 Mar 2014 17:57:25 +0200 Subject: [PATCH 15/21] [oe1] Add support for oe1.orf.at. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/oe1.py | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 youtube_dl/extractor/oe1.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 481296231..0e4b2b6e8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -177,6 +177,7 @@ from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .oe1 import OE1IE from .ooyala import OoyalaIE from .orf import ORFIE from .parliamentliveuk import ParliamentLiveUKIE diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py new file mode 100644 index 000000000..f327e9e08 --- /dev/null +++ b/youtube_dl/extractor/oe1.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals +import calendar +import datetime +import json +import re + +from .common import InfoExtractor + +# audios on oe1.orf.at are only available for 7 days, so we can't +# add tests. 
+ + +class OE1IE(InfoExtractor): + _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>\d+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('id') + data = json.loads(self._download_webpage( + 'http://oe1.orf.at/programm/%s/konsole' % show_id, + show_id + )) + + timestamp = datetime.datetime.strptime('%s %s' % ( + data['item']['day_label'], + data['item']['time'] + ), '%d.%m.%Y %H:%M') + unix_timestamp = calendar.timegm(timestamp.utctimetuple()) + + return { + 'id': show_id, + 'title': data['item']['title'], + 'url': data['item']['url_stream'], + 'ext': 'mp3', + 'description': data['item']['info'], + 'timestamp': unix_timestamp + } From 263f4b514b9b90a217152d2ef8fbf5b884b41b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 28 Mar 2014 23:01:08 +0700 Subject: [PATCH 16/21] [ntv] Add support for ntv.ru (Closes #2581) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ntv.py | 157 +++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 youtube_dl/extractor/ntv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 481296231..eef13d2b8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -177,6 +177,7 @@ from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .ntv import NTVIE from .ooyala import OoyalaIE from .orf import ORFIE from .parliamentliveuk import ParliamentLiveUKIE diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py new file mode 100644 index 000000000..29e8f7a04 --- /dev/null +++ b/youtube_dl/extractor/ntv.py @@ -0,0 +1,157 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + RegexNotFoundError, + unescapeHTML +) + + +class NTVIE(InfoExtractor): + _VALID_URL = 
r'http://(?:www\.)?ntv\.ru/(?P.+)' + + _TESTS = [ + { + 'url': 'http://www.ntv.ru/novosti/863142/', + 'info_dict': { + 'id': '746000', + 'ext': 'flv', + 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'duration': 136, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/video/novosti/750370/', + 'info_dict': { + 'id': '750370', + 'ext': 'flv', + 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'duration': 172, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', + 'info_dict': { + 'id': '747480', + 'ext': 'flv', + 'title': '«Сегодня». 21 марта 2014 года. 16:00 ', + 'description': '«Сегодня». 21 марта 2014 года. 16:00 ', + 'duration': 1496, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/kino/Koma_film', + 'info_dict': { + 'id': '750783', + 'ext': 'flv', + 'title': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ', + 'description': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ', + 'duration': 28, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', + 'info_dict': { + 'id': '751482', + 'ext': 'flv', + 'title': '«Дело врачей»: «Деревце жизни»', + 'description': '«Дело врачей»: «Деревце жизни»', + 'duration': 2590, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + ] + + _VIDEO_ID_REGEXES = [ + r'