diff --git a/MANIFEST.in b/MANIFEST.in index 08be9af71..d43cc1f3b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,5 +3,4 @@ include test/*.py include test/*.json include youtube-dl.bash-completion include youtube-dl.1 -recursive-include docs * -prune docs/_build +recursive-include docs Makefile conf.py *.rst diff --git a/README.md b/README.md index 5bb6c6e4e..1ba1486d2 100644 --- a/README.md +++ b/README.md @@ -371,7 +371,67 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). +If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): + +1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) +2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` +3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +4. 
Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + + # coding: utf-8 + from __future__ import unicode_literals + + import re + + from .common import InfoExtractor + + + class YourExtractorIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + # TODO more properties (see youtube_dl/extractor/common.py) + } + + +5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501). +9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this: + + $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add new extractor' + $ git push origin yourextractor + +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. + +In any case, thank you very much for your contributions! 
# BUGS diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1f3ccaea0..2902dbec7 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -26,16 +26,27 @@ class YDL(FakeYDL): self.msgs.append(msg) +def _make_result(formats, **kwargs): + res = { + 'formats': formats, + 'id': 'testid', + 'title': 'testttitle', + 'extractor': 'testex', + } + res.update(**kwargs) + return res + + class TestFormatSelection(unittest.TestCase): def test_prefer_free_formats(self): # Same resolution => download webm ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460}, - {'ext': 'mp4', 'height': 460}, + {'ext': 'webm', 'height': 460, 'url': 'x'}, + {'ext': 'mp4', 'height': 460, 'url': 'y'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) @@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 1080}, + {'ext': 'webm', 'height': 720, 'url': 'a'}, + {'ext': 'mp4', 'height': 1080, 'url': 'b'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -60,9 +71,9 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 720}, - {'ext': 'flv', 'height': 720}, + {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'mp4', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': '_'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720}, - {'ext': 'webm', 'height': 720}, + {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': '_'}, ] 
info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase): {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, ] - info_dict = { - 'formats': formats, 'extractor': 'test', 'id': 'testvid'} + info_dict = _make_result(formats) ydl = YDL() ydl.process_ie_result(info_dict) @@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1}, - {'format_id': '45', 'ext': 'webm', 'preference': 2}, - {'format_id': '47', 'ext': 'webm', 'preference': 3}, - {'format_id': '2', 'ext': 'flv', 'preference': 4}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': '20/47'}) ydl.process_ie_result(info_dict.copy()) @@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 
'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio'}) ydl.process_ie_result(info_dict.copy()) @@ -172,10 +182,10 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl.process_ie_result(info_dict.copy()) @@ -184,11 +194,11 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_video(self): formats = [ - {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none'}, - {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 3}, + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestvideo'}) ydl.process_ie_result(info_dict.copy()) @@ -217,10 +227,12 @@ class TestFormatSelection(unittest.TestCase): for f1id, f2id in zip(order, order[1:]): f1 = YoutubeIE._formats[f1id].copy() f1['format_id'] = f1id + f1['url'] = 'url:' + f1id f2 = YoutubeIE._formats[f2id].copy() f2['format_id'] = f2id + f2['url'] = 'url:' + f2id - info_dict = {'formats': [f1, f2], 'extractor': 'youtube'} + info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) @@ -228,7 +240,7 @@ class 
TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1id) - info_dict = {'formats': [f2, f1], 'extractor': 'youtube'} + info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index ed041ffda..bea8c41fb 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -153,6 +153,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch( 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', + ['ComedyCentralShows']) if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 4af38632e..5fb679aa1 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -42,6 +42,7 @@ from youtube_dl.extractor import ( ToypicsUserIE, XTubeUserIE, InstagramUserIE, + CSpanIE, ) @@ -314,6 +315,19 @@ class TestPlaylists(unittest.TestCase): } expect_info_dict(self, EXPECTED, test_video) + def test_CSpan_playlist(self): + dl = FakeYDL() + ie = CSpanIE(dl) + result = ie.extract( + 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '342759') + self.assertEqual( + result['title'], 'General Motors Ignition Switch Recall') + self.assertEqual(len(result['entries']), 9) + whole_duration = sum(e['duration'] for e in result['entries']) + self.assertEqual(whole_duration, 14855) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7017e58ea..95b7a9f31 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -703,6 +703,11 @@ class YoutubeDL(object): def process_video_result(self, 
info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result') + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None @@ -734,6 +739,9 @@ class YoutubeDL(object): # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): + if 'url' not in format: + raise ExtractorError('Missing "url" key in result (index %d)' % i) + if format.get('format_id') is None: format['format_id'] = compat_str(i) if format.get('format') is None: @@ -744,7 +752,7 @@ class YoutubeDL(object): ) # Automatically determine file extension if missing if 'ext' not in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() format_limit = self.params.get('format_limit', None) if format_limit: @@ -869,7 +877,7 @@ class YoutubeDL(object): try: dn = os.path.dirname(encodeFilename(filename)) - if dn != '' and not os.path.exists(dn): + if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: self.report_error('unable to create directory ' + compat_str(err)) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5a068aa8b..917f3450e 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,9 +4,10 @@ import sys import time from ..utils import ( + compat_str, encodeFilename, - timeconvert, format_bytes, + timeconvert, ) @@ -173,7 +174,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error(u'unable to rename file: %s' % str(err)) + self.report_error(u'unable to rename file: %s' % compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the 
last-modified time of the given file.""" diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 4e6abfe10..e6be6ae6c 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -297,6 +297,7 @@ class F4mFD(FileDownloader): break frags_filenames.append(frag_filename) + dest_stream.close() self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e2e66c526..66f71edf6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -40,6 +40,7 @@ from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cmt import CMTIE +from .cnet import CNETIE from .cnn import ( CNNIE, CNNBlogsIE, @@ -83,6 +84,7 @@ from .fktv import ( ) from .flickr import FlickrIE from .fourtube import FourTubeIE +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -152,6 +154,8 @@ from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE from .mooshare import MooshareIE +from .morningstar import MorningstarIE +from .motorsport import MotorsportIE from .mtv import ( MTVIE, MTVIggyIE, diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 690bc7c25..cb96c3876 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -2,39 +2,46 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor class C56IE(InfoExtractor): - _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)' + _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' IE_NAME = '56.com' _TEST = { 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', - 'file': '93440716.flv', 'md5': 
'e59995ac63d0457783ea05f93f12a866', 'info_dict': { + 'id': '93440716', + 'ext': 'flv', 'title': '网事知多少 第32期:车怒', + 'duration': 283.813, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') - info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id, - text_id, 'Downloading video info') - info = json.loads(info_page)['info'] - formats = [{ - 'format_id': f['type'], - 'filesize': int(f['filesize']), - 'url': f['url'] - } for f in info['rfiles']] + + page = self._download_json( + 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + + info = page['info'] + + formats = [ + { + 'format_id': f['type'], + 'filesize': int(f['filesize']), + 'url': f['url'] + } for f in info['rfiles'] + ] self._sort_formats(formats) return { 'id': info['vid'], 'title': info['Subject'], + 'duration': int(info['duration']) / 1000.0, 'formats': formats, 'thumbnail': info.get('bimg') or info.get('img'), } diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py new file mode 100644 index 000000000..f5ab443d2 --- /dev/null +++ b/youtube_dl/extractor/cnet.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class CNETIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' + _TEST = { + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'md5': '041233212a0d06b179c87cbcca1577b8', + 'info_dict': { + 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'thumbnail': 're:^http://.*/flmswindows8.jpg$', + 'uploader_id': 'sarah.mitroff@cbsinteractive.com', + 'uploader': 'Sarah Mitroff', + } + } + 
+ def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + data_json = self._html_search_regex( + r"
tds|thedailyshow|cr|colbert|colbertnation|colbertreport) |https?://(:www\.)? (?Pthedailyshow|thecolbertreport)\.(?:cc\.)?com/ - (full-episodes/(?P.*)| + (full-episodes/(?:[0-9a-z]{6}/)?(?P.*)| (?P - (?:videos/[^/]+/(?P[^/?#]+)) + (?:(?:guests/[^/]+|videos)/[^/]+/(?P[^/?#]+)) |(the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) |(watch/(?P[^/]*)/(?P.*)) )| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c47fc58b9..4444d1095 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -256,6 +256,17 @@ class InfoExtractor(object): outf.write(webpage_bytes) content = webpage_bytes.decode(encoding, 'replace') + + if (u'Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'