diff --git a/test/helper.py b/test/helper.py
index 230d2bd67..84b16f770 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -148,3 +148,10 @@ def assertRegexpMatches(self, text, regexp, msg=None):
             else:
                 msg = note + ', ' + msg
             self.assertTrue(m, msg)
+
+
+def assertGreaterEqual(self, got, expected, msg=None):
+    if not (got >= expected):
+        if msg is None:
+            msg = '%r not greater than or equal to %r' % (got, expected)
+        self.assertTrue(got >= expected, msg)
diff --git a/test/test_playlists.py b/test/test_playlists.py
index 1a38a667b..4789200e9 100644
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from test.helper import (
     assertRegexpMatches,
+    assertGreaterEqual,
     expect_info_dict,
     FakeYDL,
 )
@@ -71,8 +72,8 @@ class TestPlaylists(unittest.TestCase):
         ie = DailymotionUserIE(dl)
         result = ie.extract('https://www.dailymotion.com/user/nqtv')
         self.assertIsPlaylist(result)
+        assertGreaterEqual(self, len(result['entries']), 100)
         self.assertEqual(result['title'], 'Rémi Gaillard')
-        self.assertTrue(len(result['entries']) >= 100)
 
     def test_vimeo_channel(self):
         dl = FakeYDL()
@@ -111,7 +112,7 @@ class TestPlaylists(unittest.TestCase):
         ie = VineUserIE(dl)
         result = ie.extract('https://vine.co/Visa')
         self.assertIsPlaylist(result)
-        self.assertTrue(len(result['entries']) >= 47)
+        assertGreaterEqual(self, len(result['entries']), 47)
 
     def test_ustream_channel(self):
         dl = FakeYDL()
@@ -119,7 +120,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://www.ustream.tv/channel/channeljapan')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '10874166')
-        self.assertTrue(len(result['entries']) >= 54)
+        assertGreaterEqual(self, len(result['entries']), 54)
 
     def test_soundcloud_set(self):
         dl = FakeYDL()
@@ -127,7 +128,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep')
         self.assertIsPlaylist(result)
         self.assertEqual(result['title'], 'The Royal Concept EP')
-        self.assertTrue(len(result['entries']) >= 6)
+        assertGreaterEqual(self, len(result['entries']), 6)
 
     def test_soundcloud_user(self):
         dl = FakeYDL()
@@ -135,7 +136,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('https://soundcloud.com/the-concept-band')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '9615865')
-        self.assertTrue(len(result['entries']) >= 12)
+        assertGreaterEqual(self, len(result['entries']), 12)
 
     def test_soundcloud_likes(self):
         dl = FakeYDL()
@@ -143,7 +144,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('https://soundcloud.com/the-concept-band/likes')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '9615865')
-        self.assertTrue(len(result['entries']) >= 1)
+        assertGreaterEqual(self, len(result['entries']), 1)
 
     def test_soundcloud_playlist(self):
         dl = FakeYDL()
@@ -162,7 +163,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://new.livestream.com/tedx/cityenglish')
         self.assertIsPlaylist(result)
         self.assertEqual(result['title'], 'TEDCity2.0 (English)')
-        self.assertTrue(len(result['entries']) >= 4)
+        assertGreaterEqual(self, len(result['entries']), 4)
 
     def test_livestreamoriginal_folder(self):
         dl = FakeYDL()
@@ -170,7 +171,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3')
-        self.assertTrue(len(result['entries']) >= 28)
+        assertGreaterEqual(self, len(result['entries']), 28)
 
     def test_nhl_videocenter(self):
         dl = FakeYDL()
@@ -187,7 +188,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://bambuser.com/channel/pixelversity')
         self.assertIsPlaylist(result)
         self.assertEqual(result['title'], 'pixelversity')
-        self.assertTrue(len(result['entries']) >= 60)
+        assertGreaterEqual(self, len(result['entries']), 60)
 
     def test_bandcamp_album(self):
         dl = FakeYDL()
@@ -195,7 +196,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep')
         self.assertIsPlaylist(result)
         self.assertEqual(result['title'], 'Nightmare Night EP')
-        self.assertTrue(len(result['entries']) >= 4)
+        assertGreaterEqual(self, len(result['entries']), 4)
 
     def test_smotri_community(self):
         dl = FakeYDL()
@@ -204,7 +205,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'kommuna')
         self.assertEqual(result['title'], 'КПРФ')
-        self.assertTrue(len(result['entries']) >= 4)
+        assertGreaterEqual(self, len(result['entries']), 4)
 
     def test_smotri_user(self):
         dl = FakeYDL()
@@ -213,7 +214,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'inspector')
         self.assertEqual(result['title'], 'Inspector')
-        self.assertTrue(len(result['entries']) >= 9)
+        assertGreaterEqual(self, len(result['entries']), 9)
 
     def test_AcademicEarthCourse(self):
         dl = FakeYDL()
@@ -232,7 +233,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'dvoe_iz_lartsa')
         self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)')
-        self.assertTrue(len(result['entries']) >= 24)
+        assertGreaterEqual(self, len(result['entries']), 24)
 
     def test_ivi_compilation_season(self):
         dl = FakeYDL()
@@ -241,7 +242,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1')
         self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон')
-        self.assertTrue(len(result['entries']) >= 12)
+        assertGreaterEqual(self, len(result['entries']), 12)
 
     def test_imdb_list(self):
         dl = FakeYDL()
@@ -260,7 +261,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['id'], 'cryptography')
         self.assertEqual(result['title'], 'Journey into cryptography')
         self.assertEqual(result['description'], 'How have humans protected their secret messages through history? What has changed today?')
-        self.assertTrue(len(result['entries']) >= 3)
+        assertGreaterEqual(self, len(result['entries']), 3)
 
     def test_EveryonesMixtape(self):
         dl = FakeYDL()
@@ -277,7 +278,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://rutube.ru/tags/video/1800/')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '1800')
-        self.assertTrue(len(result['entries']) >= 68)
+        assertGreaterEqual(self, len(result['entries']), 68)
 
     def test_rutube_person(self):
         dl = FakeYDL()
@@ -285,7 +286,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://rutube.ru/video/person/313878/')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '313878')
-        self.assertTrue(len(result['entries']) >= 37)
+        assertGreaterEqual(self, len(result['entries']), 37)
 
     def test_multiple_brightcove_videos(self):
         # https://github.com/rg3/youtube-dl/issues/2283
@@ -322,7 +323,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '10')
         self.assertEqual(result['title'], 'Who are the hackers?')
-        self.assertTrue(len(result['entries']) >= 6)
+        assertGreaterEqual(self, len(result['entries']), 6)
 
     def test_toypics_user(self):
         dl = FakeYDL()
@@ -330,7 +331,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://videos.toypics.net/Mikey')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'Mikey')
-        self.assertTrue(len(result['entries']) >= 17)
+        assertGreaterEqual(self, len(result['entries']), 17)
 
     def test_xtube_user(self):
         dl = FakeYDL()
@@ -338,7 +339,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'greenshowers')
-        self.assertTrue(len(result['entries']) >= 155)
+        assertGreaterEqual(self, len(result['entries']), 155)
 
     def test_InstagramUser(self):
         dl = FakeYDL()
@@ -346,7 +347,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://instagram.com/porsche')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'porsche')
-        self.assertTrue(len(result['entries']) >= 2)
+        assertGreaterEqual(self, len(result['entries']), 2)
         test_video = next(
             e for e in result['entries']
             if e['id'] == '614605558512799803_462752227')
@@ -385,7 +386,7 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['id'], '152147')
         self.assertEqual(
             result['title'], 'Brace Yourself - Today\'s Weirdest News')
-        self.assertTrue(len(result['entries']) >= 10)
+        assertGreaterEqual(self, len(result['entries']), 10)
 
     def test_TeacherTubeUser(self):
         dl = FakeYDL()
@@ -393,7 +394,7 @@ class TestPlaylists(unittest.TestCase):
         result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'rbhagwati2')
-        self.assertTrue(len(result['entries']) >= 179)
+        assertGreaterEqual(self, len(result['entries']), 179)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py
index 3bb5a6308..b42cd74c7 100644
--- a/test/test_swfinterp.py
+++ b/test/test_swfinterp.py
@@ -7,6 +7,7 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
+import errno
 import io
 import json
 import re
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index e02e06523..f5f25fea1 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1197,6 +1197,10 @@ class YoutubeDL(object):
             if res:
                 res += ', '
             res += format_bytes(fdict['filesize'])
+        elif fdict.get('filesize_approx') is not None:
+            if res:
+                res += ', '
+            res += '~' + format_bytes(fdict['filesize_approx'])
         return res
 
     def list_formats(self, info_dict):
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f223b75f4..0e7b9ddaf 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -72,11 +72,9 @@ __license__ = 'Public Domain'
 
 import codecs
 import io
-import locale
 import optparse
 import os
 import random
-import re
 import shlex
 import sys
 
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a17a80a5f..8d63d9281 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -267,6 +267,8 @@ from .smotri import (
     SmotriUserIE,
     SmotriBroadcastIE,
 )
+from .snotr import SnotrIE
+from .sockshare import SockshareIE
 from .sohu import SohuIE
 from .soundcloud import (
     SoundcloudIE,
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index 02d5ba527..a62395d4b 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -42,7 +42,7 @@ class ChilloutzoneIE(InfoExtractor):
             'id': '85523671',
             'ext': 'mp4',
             'title': 'The Sunday Times - Icons',
-            'description': 'md5:3e1c0dc6047498d6728dcdaad0891762',
+            'description': 'md5:a5f7ff82e2f7a9ed77473fe666954e84',
             'uploader': 'Us',
             'uploader_id': 'usfilms',
             'upload_date': '20140131'
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index a94f42571..710d5009b 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -43,7 +43,11 @@ class CNETIE(InfoExtractor):
             raise ExtractorError('Cannot find video data')
 
         video_id = vdata['id']
-        title = vdata['headline']
+        title = vdata.get('headline')
+        if title is None:
+            title = vdata.get('title')
+        if title is None:
+            raise ExtractorError('Cannot find title!')
         description = vdata.get('dek')
         thumbnail = vdata.get('image', {}).get('path')
         author = vdata.get('author')
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e68657314..9b36e0789 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -69,6 +69,7 @@ class InfoExtractor(object):
                     * vcodec     Name of the video codec in use
                     * container  Name of the container format
                     * filesize   The number of bytes, if known in advance
+                    * filesize_approx  An estimate for the number of bytes
                     * player_url SWF Player URL (used for rtmpdump).
                     * protocol   The protocol that will be used for the actual
                                  download, lower-case.
@@ -468,7 +469,7 @@ class InfoExtractor(object):
             display_name = name
         return self._html_search_regex(
             r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
             html, display_name, fatal=fatal, **kwargs)
 
@@ -555,6 +556,7 @@ class InfoExtractor(object):
                 f.get('abr') if f.get('abr') is not None else -1,
                 audio_ext_preference,
                 f.get('filesize') if f.get('filesize') is not None else -1,
+                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                 f.get('format_id'),
             )
         formats.sort(key=_formats_key)
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 41208c976..9f569aa93 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -5,24 +5,26 @@ import os.path
 import re
 
 from .common import InfoExtractor
+from ..utils import compat_urllib_parse_unquote
 
 
 class DropboxIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)'
     _TEST = {
-        'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4',
-        'md5': '8ae17c51172fb7f93bdd6a214cc8c896',
+        'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4',
+        'md5': '8a3d905427a6951ccb9eb292f154530b',
         'info_dict': {
-            'id': '0qr9sai2veej4f8',
+            'id': 'nelirfsxnmcfbfh',
             'ext': 'mp4',
-            'title': 'THE_DOCTOR_GAMES'
+            'title': 'youtube-dl test video \'ä"BaW_jenozKc'
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        title = os.path.splitext(mobj.group('title'))[0]
+        fn = compat_urllib_parse_unquote(mobj.group('title'))
+        title = os.path.splitext(fn)[0]
         video_url = url + '?dl=1'
 
         return {
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 6e6b66660..721e5fce0 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -26,7 +26,7 @@ class FunnyOrDieIE(InfoExtractor):
             'id': 'e402820827',
             'ext': 'mp4',
             'title': 'Please Use This Song (Jon Lajoie)',
-            'description': 'md5:2ed27d364f5a805a6dba199faaf6681d',
+            'description': 'Please use this to sell something. www.jonlajoie.com',
             'thumbnail': 're:^http:.*\.jpg$',
         },
     }]
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f97b59845..9db27f9aa 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -402,7 +402,7 @@ class GenericIE(InfoExtractor):
         elif default_search == 'error':
             raise ExtractorError(
                 ('%r is not a valid URL. '
-                 'Set --default-search "ytseach" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
+                 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
                 ) % (url, url), expected=True)
         else:
             assert ':' in default_search
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 77fd08dde..c2228b2f0 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -17,7 +17,7 @@ class RTVEALaCartaIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
-        'md5': '18fcd45965bdd076efdb12cd7f6d7b9e',
+        'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
         'info_dict': {
             'id': '2491869',
             'ext': 'mp4',
diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py
index 198a08c1c..ccd545971 100644
--- a/youtube_dl/extractor/savefrom.py
+++ b/youtube_dl/extractor/savefrom.py
@@ -20,7 +20,7 @@ class SaveFromIE(InfoExtractor):
             'upload_date': '20120816',
             'uploader': 'Howcast',
             'uploader_id': 'Howcast',
-            'description': 'md5:4f0aac94361a12e1ce57d74f85265175',
+            'description': 'md5:727900f130df3dc9a25e2721497c7910',
         },
         'params': {
             'skip_download': True
diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py
new file mode 100644
index 000000000..da3b05a8d
--- /dev/null
+++ b/youtube_dl/extractor/snotr.py
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    str_to_int,
+    parse_duration,
+)
+
+
+class SnotrIE(InfoExtractor):
+    _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)'
+    _TESTS = [{
+        'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks',
+        'info_dict': {
+            'id': '13708',
+            'ext': 'flv',
+            'title': 'Drone flying through fireworks!',
+            'duration': 247,
+            'filesize_approx': 98566144,
+            'description': 'A drone flying through Fourth of July Fireworks',
+        }
+    }, {
+        'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10',
+        'info_dict': {
+            'id': '530',
+            'ext': 'flv',
+            'title': 'David Letteman - George W. Bush Top 10',
+            'duration': 126,
+            'filesize_approx': 8912896,
+            'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+
+        description = self._og_search_description(webpage)
+        video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id
+
+        view_count = str_to_int(self._html_search_regex(
+            r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>',
+            webpage, 'view count', fatal=False))
+
+        duration = parse_duration(self._html_search_regex(
+            r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>',
+            webpage, 'duration', fatal=False))
+
+        filesize_approx = float_or_none(self._html_search_regex(
+            r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>',
+            webpage, 'filesize', fatal=False), invscale=1024 * 1024)
+
+        return {
+            'id': video_id,
+            'description': description,
+            'title': title,
+            'url': video_url,
+            'view_count': view_count,
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+        }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
new file mode 100644
index 000000000..75b634bc6
--- /dev/null
+++ b/youtube_dl/extractor/sockshare.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..utils import (
+    ExtractorError,
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+import re
+
+from .common import InfoExtractor
+
+
+class SockshareIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
+    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
+    _TEST = {
+        'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
+        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
+        'info_dict': {
+            'id': '437BE28B89D799D7',
+            'title': 'big_buck_bunny_720p_surround.avi',
+            'ext': 'avi',
+            'thumbnail': 're:^http://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        url = 'http://sockshare.com/file/%s' % video_id
+        webpage = self._download_webpage(url, video_id)
+
+        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+            raise ExtractorError('Video %s does not exist' % video_id,
+                                 expected=True)
+
+        confirm_hash = self._html_search_regex(r'''(?x)<input\s+
+            type="hidden"\s+
+            value="([^"]*)"\s+
+            name="hash"
+            ''', webpage, 'hash')
+
+        fields = {
+            "hash": confirm_hash,
+            "confirm": "Continue as Free User"
+        }
+
+        post = compat_urllib_parse.urlencode(fields)
+        req = compat_urllib_request.Request(url, post)
+        # Apparently, this header is required for confirmation to work.
+        req.add_header('Host', 'www.sockshare.com')
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+        webpage = self._download_webpage(
+            req, video_id, 'Downloading video page')
+
+        video_url = self._html_search_regex(
+            r'<a href="([^"]*)".+class="download_file_link"',
+            webpage, 'file url')
+        video_url = "http://www.sockshare.com" + video_url
+        title = self._html_search_regex(r'<h1>(.+)<strong>', webpage, 'title')
+        thumbnail = self._html_search_regex(
+            r'<img\s+src="([^"]*)".+?name="bg"',
+            webpage, 'thumbnail')
+
+        formats = [{
+            'format_id': 'sd',
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
index af689e2c2..183dcb03c 100644
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -53,7 +53,7 @@ class SteamIE(InfoExtractor):
             'ext': 'mp4',
             'upload_date': '20140329',
             'title': 'FRONTIERS - Final Greenlight Trailer',
-            'description': 'md5:6df4fe8dd494ae811869672b0767e025',
+            'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9',
             'uploader': 'AAD Productions',
             'uploader_id': 'AtomicAgeDogGames',
         }
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index 25b9864ad..b87047451 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -19,16 +19,6 @@ class TagesschauIE(InfoExtractor):
             'description': 'md5:69da3c61275b426426d711bde96463ab',
             'thumbnail': 're:^http:.*\.jpg$',
         },
-    }, {
-        'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html',
-        'md5': '66652566900963a3f962333579eeffcf',
-        'info_dict': {
-            'id': '5964',
-            'ext': 'mp4',
-            'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland',
-            'description': 'md5:07bfc78c48eec3145ed4805299a1900a',
-            'thumbnail': 're:http://.*\.jpg',
-        },
     }]
 
     _FORMATS = {
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
index 2c2113b14..46d727d1d 100644
--- a/youtube_dl/extractor/teachertube.py
+++ b/youtube_dl/extractor/teachertube.py
@@ -62,7 +62,7 @@ class TeacherTubeIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_meta('title', webpage, 'title')
+        title = self._html_search_meta('title', webpage, 'title', fatal=True)
         TITLE_SUFFIX = ' - TeacherTube'
         if title.endswith(TITLE_SUFFIX):
             title = title[:-len(TITLE_SUFFIX)].strip()
@@ -101,7 +101,11 @@ class TeacherTubeUserIE(InfoExtractor):
 
     _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?'
 
-    _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">'
+    _MEDIA_RE = r'''(?sx)
+        class="?sidebar_thumb_time"?>[0-9:]+</div>
+        \s*
+        <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)"
+    '''
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -111,14 +115,12 @@ class TeacherTubeUserIE(InfoExtractor):
         webpage = self._download_webpage(url, user_id)
         urls.extend(re.findall(self._MEDIA_RE, webpage))
 
-        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1]
+        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1]
         for p in pages:
             more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p)
-            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1))
-            urls.extend(re.findall(self._MEDIA_RE, webpage))
-
-        entries = []
-        for url in urls:
-            entries.append(self.url_result(url, 'TeacherTube'))
+            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages)))
+            video_urls = re.findall(self._MEDIA_RE, webpage)
+            urls.extend(video_urls)
 
+        entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls]
         return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 255855558..a3c6e83b0 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -98,7 +98,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
             'info_dict': {
                 'id': '54469442',
                 'ext': 'mp4',
-                'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
+                'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
                 'uploader': 'The BLN & Business of Software',
                 'uploader_id': 'theblnbusinessofsoftware',
                 'duration': 3610,
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index f741ba540..ab28ef6fe 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -55,14 +55,14 @@ class WDRIE(InfoExtractor):
             },
         },
         {
-            'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html',
-            'md5': '24e83813e832badb0a8d7d1ef9ef0691',
+            'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html',
+            'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa',
             'info_dict': {
-                'id': 'mdb-463528',
+                'id': 'mdb-478135',
                 'ext': 'mp3',
-                'title': 'Süpersong: Soul Bossa Nova',
+                'title': 'Flavia Coelho: Amar é Amar',
                 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
-                'upload_date': '20140630',
+                'upload_date': '20140717',
             },
         },
     ]
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 919603c62..3ecd798d7 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -91,11 +91,9 @@ except ImportError:
     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
 
 try:
-    from urllib.parse import parse_qs as compat_parse_qs
-except ImportError: # Python 2
-    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
-    # Python 2's version is apparently totally broken
-    def _unquote(string, encoding='utf-8', errors='replace'):
+    from urllib.parse import unquote as compat_urllib_parse_unquote
+except ImportError:
+    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
         if string == '':
             return string
         res = string.split('%')
@@ -130,6 +128,13 @@ except ImportError: # Python 2
             string += pct_sequence.decode(encoding, errors)
         return string
 
+
+try:
+    from urllib.parse import parse_qs as compat_parse_qs
+except ImportError: # Python 2
+    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
+    # Python 2's version is apparently totally broken
+
     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                 encoding='utf-8', errors='replace'):
         qs, _coerce_result = qs, unicode
@@ -149,10 +154,12 @@ except ImportError: # Python 2
                 continue
             if len(nv[1]) or keep_blank_values:
                 name = nv[0].replace('+', ' ')
-                name = _unquote(name, encoding=encoding, errors=errors)
+                name = compat_urllib_parse_unquote(
+                    name, encoding=encoding, errors=errors)
                 name = _coerce_result(name)
                 value = nv[1].replace('+', ' ')
-                value = _unquote(value, encoding=encoding, errors=errors)
+                value = compat_urllib_parse_unquote(
+                    value, encoding=encoding, errors=errors)
                 value = _coerce_result(value)
                 r.append((name, value))
         return r
@@ -1193,13 +1200,6 @@ def format_bytes(bytes):
     return u'%.2f%s' % (converted, suffix)
 
 
-def str_to_int(int_str):
-    if int_str is None:
-        return None
-    int_str = re.sub(r'[,\.]', u'', int_str)
-    return int(int_str)
-
-
 def get_term_width():
     columns = os.environ.get('COLUMNS', None)
     if columns:
@@ -1267,15 +1267,22 @@ class HEADRequest(compat_urllib_request.Request):
         return "HEAD"
 
 
-def int_or_none(v, scale=1, default=None, get_attr=None):
+def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
     if get_attr:
         if v is not None:
             v = getattr(v, get_attr, None)
-    return default if v is None else (int(v) // scale)
+    return default if v is None else (int(v) * invscale // scale)
 
 
-def float_or_none(v, scale=1, default=None):
-    return default if v is None else (float(v) / scale)
+def str_to_int(int_str):
+    if int_str is None:
+        return None
+    int_str = re.sub(r'[,\.]', u'', int_str)
+    return int(int_str)
+
+
+def float_or_none(v, scale=1, invscale=1, default=None):
+    return default if v is None else (float(v) * invscale / scale)
 
 
 def parse_duration(s):