From 66aa382eae9342506db64ce3328a009fd3f06d5c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Wed, 16 Jul 2014 02:07:20 +0300 Subject: [PATCH 01/21] [sockshare] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sockshare.py | 77 +++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 youtube_dl/extractor/sockshare.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e49ac3e52..f3575b6c9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,6 +262,7 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .sockshare import SockshareIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py new file mode 100644 index 000000000..cbf2d7abe --- /dev/null +++ b/youtube_dl/extractor/sockshare.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + determine_ext, +) +import re + +from .common import InfoExtractor + + +class SockshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P[0-9A-Za-z]+)' + _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.' + _TEST = { + 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', + 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', + 'info_dict': { + 'id': '437BE28B89D799D7', + 'title': 'big_buck_bunny_720p_surround.avi', + 'ext': 'avi', + 'thumbnail': 're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://sockshare.com/file/%s' % video_id + webpage = self._download_webpage(url, video_id) + + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError(u'Video %s does not exist' % video_id, + expected=True) + + confirm_hash = self._html_search_regex(r'''(?x)(.+)', webpage, 'title') + thumbnail = self._html_search_regex(r' Date: Sat, 19 Jul 2014 22:49:25 +0500 Subject: [PATCH 02/21] [snotr] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/snotr.py | 73 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/snotr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78b95c2a5..faf473548 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -263,6 +263,7 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py new file mode 100644 index 000000000..f89e81bf3 --- /dev/null +++ b/youtube_dl/extractor/snotr.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + + str_to_int, + parse_iso8601, + + + +) + +class SnotrIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P\d+)/([\w]+)' + _TESTS =[ { + 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', + 'info_dict': { + 'id': '13708', + 'ext': 'flv', + 'title': 'Drone flying through fireworks!', + 'duration': 247, + 'filesize':12320768 + } + }, + + + + { + + 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', + 'info_dict': { + 'id': '530', + 'ext': 'flv', + 'title': 'David Letteman - George W. Bush Top 10', + 'duration': 126, + 'filesize': 1048576 + } + }] + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + + description = self._og_search_description(webpage) + + video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id + + view_count = str_to_int(self._html_search_regex(r'

\nViews:\n([\d,\.]+)

',webpage,'view count')) + + duration = self._html_search_regex(r'

\nLength:\n(.*?)

',webpage,'duration') + duration = str_to_int(duration[:1])*60 + str_to_int(duration[2:4]) + + file_size = self._html_search_regex(r'

\nFilesize:\n(.*?)

',webpage,'filesize') + file_size = str_to_int(re.match(r'\d+',file_size).group())*131072 + + return { + 'id': video_id, + 'title': title, + 'url':video_url, + 'view_count':view_count, + 'duration':duration, + 'filesize':file_size + + } \ No newline at end of file From eef4a7a3042914e4cad6d46a90308567f012ae59 Mon Sep 17 00:00:00 2001 From: "Anthony J. Bentley" Date: Sun, 20 Jul 2014 18:37:44 -0600 Subject: [PATCH 03/21] =?UTF-8?q?Fix=20typo:=20=E2=80=9Cytseach=E2=80=9D?= =?UTF-8?q?=20=E2=86=92=20=E2=80=9Cytsearch=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f97b59845..9db27f9aa 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -402,7 +402,7 @@ class GenericIE(InfoExtractor): elif default_search == 'error': raise ExtractorError( ('%r is not a valid URL. ' - 'Set --default-search "ytseach" (or run youtube-dl "ytsearch:%s" ) to search YouTube' + 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: assert ':' in default_search From 9732d77ed273406afcf9ed3ccb4d109824c9c69d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:02:44 +0200 Subject: [PATCH 04/21] [snotr] PEP8 and minor fixes (#3296) --- youtube_dl/YoutubeDL.py | 4 +++ youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/snotr.py | 55 +++++++++++++++------------------- youtube_dl/utils.py | 22 +++++++------- 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3dff723b8..686988fe5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1197,6 +1197,10 @@ class YoutubeDL(object): if res: res += ', ' res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) return res def list_formats(self, info_dict): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e68657314..3213abacf 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -69,6 +69,7 @@ class InfoExtractor(object): * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. @@ -555,6 +556,7 @@ class InfoExtractor(object): f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, + f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index f89e81bf3..e762ad8f6 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -4,49 +4,39 @@ from __future__ import unicode_literals import re from .common import InfoExtractor - from ..utils import ( - + float_or_none, str_to_int, - parse_iso8601, - - - + parse_duration, ) + class SnotrIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P\d+)/([\w]+)' - _TESTS =[ { + _TESTS = [{ 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', 'ext': 'flv', 'title': 'Drone flying through fireworks!', 'duration': 247, - 'filesize':12320768 - } - }, - - - - { - + 'filesize_approx': 98566144, + } + }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', 'ext': 'flv', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize': 1048576 - } - }] - + 'filesize_approx': 8912896, + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - # TODO more code goes here, for example ... webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) @@ -54,20 +44,23 @@ class SnotrIE(InfoExtractor): video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id - view_count = str_to_int(self._html_search_regex(r'

\nViews:\n([\d,\.]+)

',webpage,'view count')) + view_count = str_to_int(self._html_search_regex( + r'

\nViews:\n([\d,\.]+)

', + webpage, 'view count', fatal=False)) - duration = self._html_search_regex(r'

\nLength:\n(.*?)

',webpage,'duration') - duration = str_to_int(duration[:1])*60 + str_to_int(duration[2:4]) + duration = parse_duration(self._html_search_regex( + r'

\nLength:\n\s*([0-9:]+).*?

', + webpage, 'duration', fatal=False)) - file_size = self._html_search_regex(r'

\nFilesize:\n(.*?)

',webpage,'filesize') - file_size = str_to_int(re.match(r'\d+',file_size).group())*131072 + filesize_approx = float_or_none(self._html_search_regex( + r'

\nFilesize:\n\s*([0-9.]+)\s*megabyte

', + webpage, 'filesize', fatal=False), invscale=1024 * 1024) return { 'id': video_id, 'title': title, - 'url':video_url, - 'view_count':view_count, - 'duration':duration, - 'filesize':file_size - - } \ No newline at end of file + 'url': video_url, + 'view_count': view_count, + 'duration': duration, + 'filesize_approx': filesize_approx, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 919603c62..bf4d1112f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1193,13 +1193,6 @@ def format_bytes(bytes): return u'%.2f%s' % (converted, suffix) -def str_to_int(int_str): - if int_str is None: - return None - int_str = re.sub(r'[,\.]', u'', int_str) - return int(int_str) - - def get_term_width(): columns = os.environ.get('COLUMNS', None) if columns: @@ -1267,15 +1260,22 @@ class HEADRequest(compat_urllib_request.Request): return "HEAD" -def int_or_none(v, scale=1, default=None, get_attr=None): +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) - return default if v is None else (int(v) // scale) + return default if v is None else (int(v) * invscale // scale) -def float_or_none(v, scale=1, default=None): - return default if v is None else (float(v) / scale) +def str_to_int(int_str): + if int_str is None: + return None + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) + + +def float_or_none(v, scale=1, invscale=1, default=None): + return default if v is None else (float(v) * invscale / scale) def parse_duration(s): From 54330a1c3c3d4f3c4ce520e0deeece68120c3051 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:07:26 +0200 Subject: [PATCH 05/21] [swfinterp] Fix imports --- test/test_swfinterp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 3bb5a6308..b42cd74c7 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -7,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import errno import io import json import re From da8fb85859964d9a1d21a0328eb9044e19499d9c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:08:44 +0200 Subject: [PATCH 06/21] [snotr] Add description --- youtube_dl/extractor/snotr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index e762ad8f6..da3b05a8d 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -21,6 +21,7 @@ class SnotrIE(InfoExtractor): 'title': 'Drone flying through fireworks!', 'duration': 247, 'filesize_approx': 98566144, + 'description': 'A drone flying through Fourth of July Fireworks', } }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', @@ -30,6 +31,7 @@ class SnotrIE(InfoExtractor): 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, 'filesize_approx': 8912896, + 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', } }] @@ -41,7 +43,6 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id view_count = str_to_int(self._html_search_regex( @@ -58,6 +59,7 @@ class SnotrIE(InfoExtractor): return { 'id': video_id, + 'description': description, 'title': title, 'url': video_url, 'view_count': view_count, From db964a33a1c8ec0449fe2e39cf8d5de70daaffc2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:12:50 +0200 Subject: [PATCH 07/21] Remove unused imports --- youtube_dl/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f223b75f4..0e7b9ddaf 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -72,11 +72,9 @@ __license__ = 'Public Domain' import codecs import io -import locale import optparse import os import random -import re import shlex import sys From 9aeaf730ad712aed29d241d4e6655b8e5fee1d47 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:14:06 +0200 Subject: [PATCH 08/21] [rtve] Fix md5sum Looks like these guys reencoded the video. --- youtube_dl/extractor/rtve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 77fd08dde..c2228b2f0 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ class RTVEALaCartaIE(InfoExtractor): _TEST = { 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '18fcd45965bdd076efdb12cd7f6d7b9e', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', 'info_dict': { 'id': '2491869', 'ext': 'mp4', From 468d19a9c15f1a3ddd5363c4a966667d777782b0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:15:23 +0200 Subject: [PATCH 09/21] [savefrom] Fix test description --- youtube_dl/extractor/savefrom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 198a08c1c..ccd545971 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -20,7 +20,7 @@ class SaveFromIE(InfoExtractor): 'upload_date': '20120816', 'uploader': 'Howcast', 'uploader_id': 'Howcast', - 'description': 'md5:4f0aac94361a12e1ce57d74f85265175', + 'description': 'md5:727900f130df3dc9a25e2721497c7910', }, 'params': { 'skip_download': True From 4f95d455edf20583abd85801c23e88fa749be237 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:17:44 +0200 Subject: [PATCH 10/21] [steam] Update test description --- youtube_dl/extractor/steam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index af689e2c2..183dcb03c 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -53,7 +53,7 @@ class SteamIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20140329', 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': 'md5:6df4fe8dd494ae811869672b0767e025', + 'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } From d8624e6a80751c09a48ff6b9db1d4d85e377c437 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:25:49 +0200 Subject: [PATCH 11/21] [test_playlist] Add and use assertGreaterEqual --- test/helper.py | 7 +++++++ test/test_playlists.py | 47 +++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/test/helper.py b/test/helper.py index 230d2bd67..84b16f770 100644 --- a/test/helper.py +++ b/test/helper.py @@ -148,3 +148,10 @@ def assertRegexpMatches(self, text, regexp, msg=None): else: msg = note + ', ' + msg self.assertTrue(m, msg) + + +def assertGreaterEqual(self, got, expected, msg=None): + if not (got >= expected): + if msg is None: + msg = '%r not greater than or equal to %r' % (got, expected) + self.assertTrue(got >= expected, msg) diff --git a/test/test_playlists.py b/test/test_playlists.py index 1a38a667b..4789200e9 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( assertRegexpMatches, + assertGreaterEqual, expect_info_dict, FakeYDL, ) @@ -71,8 +72,8 @@ class TestPlaylists(unittest.TestCase): ie = DailymotionUserIE(dl) result = ie.extract('https://www.dailymotion.com/user/nqtv') self.assertIsPlaylist(result) + assertGreaterEqual(self, len(result['entries']), 100) self.assertEqual(result['title'], 'Rémi Gaillard') - self.assertTrue(len(result['entries']) >= 100) def test_vimeo_channel(self): dl = FakeYDL() @@ -111,7 +112,7 @@ class TestPlaylists(unittest.TestCase): ie = VineUserIE(dl) result = ie.extract('https://vine.co/Visa') self.assertIsPlaylist(result) - self.assertTrue(len(result['entries']) >= 47) + assertGreaterEqual(self, len(result['entries']), 47) def test_ustream_channel(self): dl = FakeYDL() @@ -119,7 +120,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.ustream.tv/channel/channeljapan') self.assertIsPlaylist(result) self.assertEqual(result['id'], '10874166') - self.assertTrue(len(result['entries']) >= 54) + assertGreaterEqual(self, len(result['entries']), 54) def test_soundcloud_set(self): dl = FakeYDL() @@ -127,7 +128,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'The Royal Concept EP') - self.assertTrue(len(result['entries']) >= 6) + assertGreaterEqual(self, len(result['entries']), 6) def test_soundcloud_user(self): dl = FakeYDL() @@ -135,7 +136,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band') self.assertIsPlaylist(result) self.assertEqual(result['id'], '9615865') - self.assertTrue(len(result['entries']) >= 12) + assertGreaterEqual(self, len(result['entries']), 12) def test_soundcloud_likes(self): dl = FakeYDL() @@ -143,7 +144,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band/likes') self.assertIsPlaylist(result) self.assertEqual(result['id'], '9615865') - self.assertTrue(len(result['entries']) >= 1) + assertGreaterEqual(self, len(result['entries']), 1) def test_soundcloud_playlist(self): dl = FakeYDL() @@ -162,7 +163,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://new.livestream.com/tedx/cityenglish') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'TEDCity2.0 (English)') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_livestreamoriginal_folder(self): dl = FakeYDL() @@ -170,7 +171,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3') - self.assertTrue(len(result['entries']) >= 28) + assertGreaterEqual(self, len(result['entries']), 28) def test_nhl_videocenter(self): dl = FakeYDL() @@ -187,7 +188,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://bambuser.com/channel/pixelversity') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'pixelversity') - self.assertTrue(len(result['entries']) >= 60) + assertGreaterEqual(self, len(result['entries']), 60) def test_bandcamp_album(self): dl = FakeYDL() @@ -195,7 +196,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'Nightmare Night EP') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_smotri_community(self): dl = FakeYDL() @@ -204,7 +205,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'kommuna') self.assertEqual(result['title'], 'КПРФ') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_smotri_user(self): dl = FakeYDL() @@ -213,7 +214,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'inspector') self.assertEqual(result['title'], 'Inspector') - self.assertTrue(len(result['entries']) >= 9) + assertGreaterEqual(self, len(result['entries']), 9) def test_AcademicEarthCourse(self): dl = FakeYDL() @@ -232,7 +233,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dvoe_iz_lartsa') self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)') - self.assertTrue(len(result['entries']) >= 24) + assertGreaterEqual(self, len(result['entries']), 24) def test_ivi_compilation_season(self): dl = FakeYDL() @@ -241,7 +242,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1') self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон') - self.assertTrue(len(result['entries']) >= 12) + assertGreaterEqual(self, len(result['entries']), 12) def test_imdb_list(self): dl = FakeYDL() @@ -260,7 +261,7 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], 'cryptography') self.assertEqual(result['title'], 'Journey into cryptography') self.assertEqual(result['description'], 'How have humans protected their secret messages through history? What has changed today?') - self.assertTrue(len(result['entries']) >= 3) + assertGreaterEqual(self, len(result['entries']), 3) def test_EveryonesMixtape(self): dl = FakeYDL() @@ -277,7 +278,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://rutube.ru/tags/video/1800/') self.assertIsPlaylist(result) self.assertEqual(result['id'], '1800') - self.assertTrue(len(result['entries']) >= 68) + assertGreaterEqual(self, len(result['entries']), 68) def test_rutube_person(self): dl = FakeYDL() @@ -285,7 +286,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://rutube.ru/video/person/313878/') self.assertIsPlaylist(result) self.assertEqual(result['id'], '313878') - self.assertTrue(len(result['entries']) >= 37) + assertGreaterEqual(self, len(result['entries']), 37) def test_multiple_brightcove_videos(self): # https://github.com/rg3/youtube-dl/issues/2283 @@ -322,7 +323,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], '10') self.assertEqual(result['title'], 'Who are the hackers?') - self.assertTrue(len(result['entries']) >= 6) + assertGreaterEqual(self, len(result['entries']), 6) def test_toypics_user(self): dl = FakeYDL() @@ -330,7 +331,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://videos.toypics.net/Mikey') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'Mikey') - self.assertTrue(len(result['entries']) >= 17) + assertGreaterEqual(self, len(result['entries']), 17) def test_xtube_user(self): dl = FakeYDL() @@ -338,7 +339,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'greenshowers') - self.assertTrue(len(result['entries']) >= 155) + assertGreaterEqual(self, len(result['entries']), 155) def test_InstagramUser(self): dl = FakeYDL() @@ -346,7 +347,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://instagram.com/porsche') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'porsche') - self.assertTrue(len(result['entries']) >= 2) + assertGreaterEqual(self, len(result['entries']), 2) test_video = next( e for e in result['entries'] if e['id'] == '614605558512799803_462752227') @@ -385,7 +386,7 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], '152147') self.assertEqual( result['title'], 'Brace Yourself - Today\'s Weirdest News') - self.assertTrue(len(result['entries']) >= 10) + assertGreaterEqual(self, len(result['entries']), 10) def test_TeacherTubeUser(self): dl = FakeYDL() @@ -393,7 +394,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'rbhagwati2') - self.assertTrue(len(result['entries']) >= 179) + assertGreaterEqual(self, len(result['entries']), 179) if __name__ == '__main__': unittest.main() From 1a30deca50d6256bb833aee672d5055d72319aca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Jul 2014 12:47:01 +0200 Subject: [PATCH 12/21] [teachertube] Fix title and playlist recognition --- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/teachertube.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3213abacf..9b36e0789 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -469,7 +469,7 @@ class InfoExtractor(object): display_name = name return self._html_search_regex( r'''(?ix)]+(?:itemprop|name|property)=["\']%s["\']) + (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=fatal, **kwargs) diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 2c2113b14..46d727d1d 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -62,7 +62,7 @@ class TeacherTubeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('title', webpage, 'title') + title = self._html_search_meta('title', webpage, 'title', fatal=True) TITLE_SUFFIX = ' - TeacherTube' if title.endswith(TITLE_SUFFIX): title = title[:-len(TITLE_SUFFIX)].strip() @@ -101,7 +101,11 @@ class TeacherTubeUserIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P[0-9a-zA-Z]+)/?' - _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+.+?' + _MEDIA_RE = r'''(?sx) + class="?sidebar_thumb_time"?>[0-9:]+ + \s* + Date: Mon, 21 Jul 2014 12:57:40 +0200 Subject: [PATCH 13/21] [dropbox] Fix test and add support for spaces in filenames --- youtube_dl/extractor/dropbox.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 41208c976..1711f0263 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,24 +5,26 @@ import os.path import re from .common import InfoExtractor +from ..utils import compat_urllib_parse class DropboxIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P[a-zA-Z0-9]{15})/(?P[^?#]*)' _TEST = { - 'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4', - 'md5': '8ae17c51172fb7f93bdd6a214cc8c896', + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4', + 'md5': '8a3d905427a6951ccb9eb292f154530b', 'info_dict': { - 'id': '0qr9sai2veej4f8', + 'id': 'nelirfsxnmcfbfh', 'ext': 'mp4', - 'title': 'THE_DOCTOR_GAMES' + 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - title = os.path.splitext(mobj.group('title'))[0] + fn = compat_urllib_parse.unquote(mobj.group('title')) + title = os.path.splitext(fn)[0] video_url = url + '?dl=1' return { From 6f5342a201e2568ce91454d96281165c39dae16e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:03:18 +0200 Subject: [PATCH 14/21] [cnet] Fix title extraction URLs are still missing --- youtube_dl/extractor/cnet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index a94f42571..710d5009b 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -43,7 +43,11 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') video_id = vdata['id'] - title = vdata['headline'] + title = vdata.get('headline') + if title is None: + title = vdata.get('title') + if title is None: + raise ExtractorError('Cannot find title!') description = vdata.get('dek') thumbnail = vdata.get('image', {}).get('path') author = vdata.get('author') From 0e6ebc13d154e6b8b063dfe7e9c2dd28d427fb77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:11:24 +0200 Subject: [PATCH 15/21] [vimeo] Update test description --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 255855558..a3c6e83b0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -98,7 +98,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'info_dict': { 'id': '54469442', 'ext': 'mp4', - 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', + 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, From a850fde1d82d86ed5b75e6f7e1f2e43817946290 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:14:41 +0200 Subject: [PATCH 16/21] [funnyordie] Fix test description --- youtube_dl/extractor/funnyordie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 6e6b66660..721e5fce0 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -26,7 +26,7 @@ class FunnyOrDieIE(InfoExtractor): 'id': 'e402820827', 'ext': 'mp4', 'title': 'Please Use This Song (Jon Lajoie)', - 'description': 'md5:2ed27d364f5a805a6dba199faaf6681d', + 'description': 'Please use this to sell something. www.jonlajoie.com', 'thumbnail': 're:^http:.*\.jpg$', }, }] From caf5a8817bc53e6d799c70c71d7ca03568738620 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:16:48 +0200 Subject: [PATCH 17/21] [chilloutzone] Fix test description --- youtube_dl/extractor/chilloutzone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 02d5ba527..a62395d4b 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -42,7 +42,7 @@ class ChilloutzoneIE(InfoExtractor): 'id': '85523671', 'ext': 'mp4', 'title': 'The Sunday Times - Icons', - 'description': 'md5:3e1c0dc6047498d6728dcdaad0891762', + 'description': 'md5:a5f7ff82e2f7a9ed77473fe666954e84', 'uploader': 'Us', 'uploader_id': 'usfilms', 'upload_date': '20140131' From ff1956e07b64735a37de886fa2e800dd823bb3e1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:19:41 +0200 Subject: [PATCH 18/21] [wdr] Replace test case --- youtube_dl/extractor/wdr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f741ba540..ab28ef6fe 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -55,14 +55,14 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', - 'md5': '24e83813e832badb0a8d7d1ef9ef0691', + 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', + 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', 'info_dict': { - 'id': 'mdb-463528', + 'id': 'mdb-478135', 'ext': 'mp3', - 'title': 'Süpersong: Soul Bossa Nova', + 'title': 'Flavia Coelho: Amar é Amar', 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140630', + 'upload_date': '20140717', }, }, ] From df8ba0d2cf9ea0ae1fde4c9f76a12f315e88aef3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:22:14 +0200 Subject: [PATCH 19/21] [tagesschau] Remove test case See http://de.wikipedia.org/wiki/Depublizieren for the sad rationale. --- youtube_dl/extractor/tagesschau.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 25b9864ad..b87047451 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -19,16 +19,6 @@ class TagesschauIE(InfoExtractor): 'description': 'md5:69da3c61275b426426d711bde96463ab', 'thumbnail': 're:^http:.*\.jpg$', }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', - 'md5': '66652566900963a3f962333579eeffcf', - 'info_dict': { - 'id': '5964', - 'ext': 'mp4', - 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', - 'description': 'md5:07bfc78c48eec3145ed4805299a1900a', - 'thumbnail': 're:http://.*\.jpg', - }, }] _FORMATS = { From 06c155420fda2a922a7219dd6758f42b868e6d96 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:25:59 +0200 Subject: [PATCH 20/21] [sockshare] Simplify (#3268) --- youtube_dl/extractor/sockshare.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py index cbf2d7abe..75b634bc6 100644 --- a/youtube_dl/extractor/sockshare.py +++ b/youtube_dl/extractor/sockshare.py @@ -5,7 +5,6 @@ from ..utils import ( ExtractorError, compat_urllib_parse, compat_urllib_request, - determine_ext, ) import re @@ -34,7 +33,7 @@ class SockshareIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError(u'Video %s does not exist' % video_id, + raise ExtractorError('Video %s does not exist' % video_id, expected=True) confirm_hash = self._html_search_regex(r'''(?x)<input\s+ @@ -54,19 +53,21 @@ class SockshareIE(InfoExtractor): req.add_header('Host', 'www.sockshare.com') req.add_header('Content-type', 'application/x-www-form-urlencoded') - webpage = self._download_webpage(req, video_id, 'Downloading video page') + webpage = self._download_webpage( + req, video_id, 'Downloading video page') - video_url = self._html_search_regex(r'<a href="([^"]*)".+class="download_file_link"', webpage, 'file url') + video_url = self._html_search_regex( + r'<a href="([^"]*)".+class="download_file_link"', + webpage, 'file url') video_url = "http://www.sockshare.com" + video_url title = self._html_search_regex(r'<h1>(.+)<strong>', webpage, 'title') - thumbnail = self._html_search_regex(r'<img\ssrc="([^"]*)".+name="bg"', - webpage, 'thumbnail') - ext = determine_ext(title) + thumbnail = self._html_search_regex( + r'<img\s+src="([^"]*)".+?name="bg"', + webpage, 'thumbnail') formats = [{ 'format_id': 'sd', 'url': video_url, - 'ext': ext, }] return { From f1f725c6a0e567283704046fc21614f4826e77fd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:55:47 +0200 Subject: [PATCH 21/21] [dropbox] Fix title encoding on Python 2 --- youtube_dl/extractor/dropbox.py | 4 ++-- youtube_dl/utils.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 1711f0263..9f569aa93 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,7 +5,7 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse +from ..utils import compat_urllib_parse_unquote class DropboxIE(InfoExtractor): @@ -23,7 +23,7 @@ class DropboxIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - fn = compat_urllib_parse.unquote(mobj.group('title')) + fn = compat_urllib_parse_unquote(mobj.group('title')) title = os.path.splitext(fn)[0] video_url = url + '?dl=1' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bf4d1112f..3ecd798d7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -91,11 +91,9 @@ except ImportError: compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - def _unquote(string, encoding='utf-8', errors='replace'): + from urllib.parse import unquote as compat_urllib_parse_unquote +except ImportError: + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): if string == '': return string res = string.split('%') @@ -130,6 +128,13 @@ except ImportError: # Python 2 string += pct_sequence.decode(encoding, errors) return string + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'): qs, _coerce_result = qs, unicode @@ -149,10 +154,12 @@ except ImportError: # Python 2 continue if len(nv[1]) or keep_blank_values: name = nv[0].replace('+', ' ') - name = _unquote(name, encoding=encoding, errors=errors) + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) name = _coerce_result(name) value = nv[1].replace('+', ' ') - value = _unquote(value, encoding=encoding, errors=errors) + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) value = _coerce_result(value) r.append((name, value)) return r