From 3d1bb6b4dd472e0232af029f05e0a1c8e3c1e1c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 16 Apr 2014 15:45:05 +0200 Subject: [PATCH 001/184] Add an extractor for tlc.de (fixes #2748) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tlc.py | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 youtube_dl/extractor/tlc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3a91e1a46..ae5296d90 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -251,6 +251,7 @@ from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .tinypic import TinyPicIE +from .tlc import TlcDeIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py new file mode 100644 index 000000000..0a092ea3c --- /dev/null +++ b/youtube_dl/extractor/tlc.py @@ -0,0 +1,42 @@ +# encoding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE + + +class TlcDeIE(InfoExtractor): + IE_NAME = 'tlc.de' + _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)' + + _TEST = { + 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', + 'info_dict': { + 'id': '3235167922001', + 'ext': 'mp4', + 'title': 'Breaking Amish: Die Welt da draußen', + 'uploader': 'Discovery Networks - Germany', + 'description': 'Vier Amische und eine Mennonitin wagen in New York' + ' den Sprung in ein komplett anderes Leben. 
Begleitet sie auf' + ' ihrem spannenden Weg.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + iframe_url = self._search_regex( + '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage, + 'iframe url') + # Otherwise we don't get the correct 'BrightcoveExperience' element, + # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ + iframe_url = iframe_url.replace('.htm?', '.php?') + iframe = self._download_webpage(iframe_url, title) + + return { + '_type': 'url', + 'url': BrightcoveIE._extract_brightcove_url(iframe), + 'ie': BrightcoveIE.ie_key(), + } From b075d25bedc26a2ec8202f612f6949eb356a5ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 16 Apr 2014 20:47:39 +0700 Subject: [PATCH 002/184] [canalplus] Prefer f4m and modernize (Closes #2749) --- youtube_dl/extractor/canalplus.py | 70 ++++++++++++++++++------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 49dfd881e..553eeb230 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -8,46 +10,58 @@ from ..utils import unified_strdate class CanalplusIE(InfoExtractor): _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' - IE_NAME = u'canalplus.fr' + IE_NAME = 'canalplus.fr' _TEST = { - u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - u'file': u'922470.flv', - u'info_dict': { - u'title': u'Zapping - 26/08/13', - u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 
2013', - u'upload_date': u'20130826', + 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + 'info_dict': { + 'id': '922470', + 'ext': 'flv', + 'title': 'Zapping - 26/08/13', + 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', + 'upload_date': '20130826', }, - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.groupdict().get('id') + video_id = mobj.group('id') + if video_id is None: webpage = self._download_webpage(url, mobj.group('path')) - video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, u'video id') + video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id - doc = self._download_xml(info_url,video_id, - u'Downloading video info') + doc = self._download_xml(info_url, video_id, 'Downloading video XML') - self.report_extraction(video_id) video_info = [video for video in doc if video.find('ID').text == video_id][0] - infos = video_info.find('INFOS') media = video_info.find('MEDIA') - formats = [media.find('VIDEOS/%s' % format) - for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] - video_url = [format.text for format in formats if format is not None][-1] + infos = video_info.find('INFOS') - return {'id': video_id, - 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, - infos.find('TITRAGE/SOUS_TITRE').text), - 'url': video_url, - 'ext': 'flv', - 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), - 'thumbnail': media.find('IMAGES/GRAND').text, - 'description': infos.find('DESCRIPTION').text, - 'view_count': int(infos.find('NB_VUES').text), - } + preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'] + + formats = [ + { + 'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text, + 'format_id': fmt.tag, + 'ext': 'mp4' if 
fmt.tag == 'HLS' else 'flv', + 'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1, + } for fmt in media.find('VIDEOS') if fmt.text + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), + 'like_count': int(infos.find('NB_LIKES').text), + 'comment_count': int(infos.find('NB_COMMENTS').text), + 'formats': formats, + } \ No newline at end of file From f401c6f69f2d4bda37cefcde6e4a0500bdb20892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 16 Apr 2014 15:54:00 +0200 Subject: [PATCH 003/184] [canalplus] Download the video in the test It doesn't use rtmpdump now. --- youtube_dl/extractor/canalplus.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 553eeb230..a67aed027 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -14,6 +14,7 @@ class CanalplusIE(InfoExtractor): _TEST = { 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + 'md5': '60c29434a416a83c15dae2587d47027d', 'info_dict': { 'id': '922470', 'ext': 'flv', @@ -21,9 +22,6 @@ class CanalplusIE(InfoExtractor): 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', 'upload_date': '20130826', }, - 'params': { - 'skip_download': True, - }, } def _real_extract(self, url): From f270256e06237039779c81e833ccfa098edf6986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 16 Apr 2014 20:27:33 +0200 
Subject: [PATCH 004/184] [tlc] Add an extractor for tlc.com It uses the same system as discovery.com --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/tlc.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ae5296d90..b9c759165 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -251,7 +251,7 @@ from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .tinypic import TinyPicIE -from .tlc import TlcDeIE +from .tlc import TlcIE, TlcDeIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 0a092ea3c..ad175b83e 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -4,6 +4,24 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveIE +from .discovery import DiscoveryIE + + +class TlcIE(DiscoveryIE): + IE_NAME = 'tlc.com' + _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' 
+ + _TEST = { + 'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm', + 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a', + 'info_dict': { + 'id': '853232', + 'ext': 'mp4', + 'title': 'Cake Boss: Too Big to Fly', + 'description': 'Buddy has taken on a high flying task.', + 'duration': 119, + }, + } class TlcDeIE(InfoExtractor): From e6c6d10d99ebefcf80ebbb3c5bfef9b675829bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Apr 2014 19:59:52 +0700 Subject: [PATCH 005/184] [podomatic] Improve video URL extraction (Closes #2763) --- youtube_dl/extractor/podomatic.py | 42 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index 19ad45c98..ffafd2380 100644 --- a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -6,22 +6,36 @@ import re from .common import InfoExtractor from ..utils import int_or_none - class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' - _TEST = { - "url": "http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00", - "file": "2009-01-02T16_03_35-08_00.mp3", - "md5": "84bb855fcf3429e6bf72460e1eed782d", - "info_dict": { - "uploader": "Science Teaching Tips", - "uploader_id": "scienceteachingtips", - "title": "64. When the Moon Hits Your Eye", - "duration": 446, - } - } + _TESTS = [ + { + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. 
When the Moon Hits Your Eye', + 'duration': 446, + } + }, + { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -32,10 +46,12 @@ class PodomaticIE(InfoExtractor): '?permalink=true&rtmp=0') % (mobj.group('proto'), channel, video_id)) data_json = self._download_webpage( - json_url, video_id, note=u'Downloading video info') + json_url, video_id, 'Downloading video info') data = json.loads(data_json) video_url = data['downloadLink'] + if not video_url: + video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) uploader = data['podcast'] title = data['title'] thumbnail = data['imageLocation'] From 1bf3210816bb3b4be2db43f95d1c23da63bfb097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Apr 2014 21:11:09 +0700 Subject: [PATCH 006/184] [noco] Add support for noco.tv (Closes #2712) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/noco.py | 105 +++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 youtube_dl/extractor/noco.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b9c759165..e0ef1cd3e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -181,6 +181,7 @@ from .nfb import NFBIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE +from .noco import NocoIE from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py new file mode 100644 index 000000000..6b0c49e02 --- 
/dev/null +++ b/youtube_dl/extractor/noco.py @@ -0,0 +1,105 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unified_strdate, + compat_str, +) + + +class NocoIE(InfoExtractor): + _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + + _TEST = { + 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/', + 'md5': '0a993f0058ddbcd902630b2047ef710e', + 'info_dict': { + 'id': '11538', + 'ext': 'mp4', + 'title': 'Ami Ami Idol - Hello! France', + 'description': 'md5:6fcfdbbb73aee107a6b7553cefbcbeae', + 'upload_date': '20140412', + 'uploader': 'Nolife', + 'uploader_id': 'NOL', + 'duration': 2851.2, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + medias = self._download_json( + 'http://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON') + + formats = [] + + for fmt in medias['fr']['video_list']['default']['quality_list']: + format_id = fmt['quality_key'] + + file = self._download_json( + 'http://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id), + video_id, 'Downloading %s video JSON' % format_id) + + file_url = file['file'] + if not file_url: + continue + + if file_url == 'forbidden': + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']), + expected=True) + + formats.append({ + 'url': file_url, + 'format_id': format_id, + 'width': fmt['res_width'], + 'height': fmt['res_lines'], + 'abr': fmt['audiobitrate'], + 'vbr': fmt['videobitrate'], + 'filesize': fmt['filesize'], + 'format_note': fmt['quality_name'], + 'preference': fmt['priority'], + }) + + self._sort_formats(formats) + + show = self._download_json( + 'http://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0] + + upload_date = 
unified_strdate(show['indexed']) + uploader = show['partner_name'] + uploader_id = show['partner_key'] + duration = show['duration_ms'] / 1000.0 + thumbnail = show['screenshot'] + + episode = show.get('show_TT') or show.get('show_OT') + family = show.get('family_TT') or show.get('family_OT') + episode_number = show.get('episode_number') + + title = '' + if family: + title += family + if episode_number: + title += ' #' + compat_str(episode_number) + if episode: + title += ' - ' + episode + + description = show.get('show_resume') or show.get('family_resume') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'formats': formats, + } \ No newline at end of file From 525dc9809e8ddd1761e4eff7517bddc527c44e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Apr 2014 21:36:04 +0700 Subject: [PATCH 007/184] [noco] Fix test description md5 --- youtube_dl/extractor/noco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 6b0c49e02..ec011eb49 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -21,7 +21,7 @@ class NocoIE(InfoExtractor): 'id': '11538', 'ext': 'mp4', 'title': 'Ami Ami Idol - Hello! 
France', - 'description': 'md5:6fcfdbbb73aee107a6b7553cefbcbeae', + 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86', 'upload_date': '20140412', 'uploader': 'Nolife', 'uploader_id': 'NOL', From 4145a257bed5caa4070cef5a87e1184ebb70ea75 Mon Sep 17 00:00:00 2001 From: MikeCol <MikeCol@gmx.net> Date: Sat, 19 Apr 2014 00:29:42 +0200 Subject: [PATCH 008/184] Extended regex match to include gay clips --- youtube_dl/extractor/extremetube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 1c20e4364..d7fc43287 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -9,7 +9,7 @@ from ..utils import ( ) class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' _TEST = { u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', u'file': u'652431.mp4', From d7f1e7c88f01105d07f8552675b4c272b8206971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Apr 2014 15:59:12 +0700 Subject: [PATCH 009/184] [rutube] Fix extraction --- youtube_dl/extractor/rutube.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index f1ce66433..4f7f8cb6d 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -43,13 +43,14 @@ class RutubeIE(InfoExtractor): 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') - trackinfo = self._download_json( - 'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, - video_id, 'Downloading trackinfo JSON') - # Some videos don't have the author field - author = 
trackinfo.get('author') or {} - m3u8_url = trackinfo['video_balancer'].get('m3u8') + author = video.get('author') or {} + + options = self._download_json( + 'http://rutube.ru/api/play/options/%s/?format=json' %video_id, + video_id, 'Downloading options JSON') + + m3u8_url = options['video_balancer'].get('m3u8') if m3u8_url is None: raise ExtractorError('Couldn\'t find m3u8 manifest url') From 51745be312e3942deb331fa54a9cc358205db24c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 11:55:33 +0200 Subject: [PATCH 010/184] release 2014.04.19 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 55382f0b0..b20bb8753 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.13' +__version__ = '2014.04.19' From 5367fe7f4d8699b711a712598615a815a013fa9c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 13:01:15 +0200 Subject: [PATCH 011/184] [test_all_urls] Simplify --- test/test_all_urls.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 7f7362a3a..a9c4ed9e3 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -77,20 +77,20 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_justin_tv_channelid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv")) - 
self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/")) - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/")) + self.assertTrue(JustinTVIE.suitable('justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv/')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/')) def test_justintv_videoid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/b/328087483')) def test_justin_tv_chapterid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361')) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) From 52fadd5fb2ea5d7e7cd6000203aa7ef886ffad07 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 19:41:06 +0200 Subject: [PATCH 012/184] [test_all_urls] Add support for distributed URL matching test definition --- test/helper.py | 14 ++++++++++---- test/test_all_urls.py | 3 ++- youtube_dl/extractor/extremetube.py | 7 +++++-- youtube_dl/extractor/ign.py | 7 ++----- youtube_dl/extractor/yahoo.py | 7 ++----- 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/test/helper.py b/test/helper.py index 8739f816c..09873aea3 100644 --- a/test/helper.py +++ b/test/helper.py @@ -74,13 +74,19 @@ class FakeYDL(YoutubeDL): 
old_report_warning(message) self.report_warning = types.MethodType(report_warning, self) -def gettestcases(): + +def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): t = getattr(ie, '_TEST', None) if t: - t['name'] = type(ie).__name__[:-len('IE')] - yield t - for t in getattr(ie, '_TESTS', []): + assert not hasattr(ie, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(ie).__name__ + tests = [t] + else: + tests = getattr(ie, '_TESTS', []) + for t in tests: + if not include_onlymatching and getattr(t, 'only_matching', False): + continue t['name'] = type(ie).__name__[:-len('IE')] yield t diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a9c4ed9e3..4b56137ce 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -106,7 +106,7 @@ class TestAllURLsMatching(unittest.TestCase): def test_no_duplicates(self): ies = gen_extractors() - for tc in gettestcases(): + for tc in gettestcases(include_onlymatching=True): url = tc['url'] for ie in ies: if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'): @@ -176,5 +176,6 @@ class TestAllURLsMatching(unittest.TestCase): 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index d7fc43287..7612a0364 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -10,7 +10,7 @@ from ..utils import ( class ExtremeTubeIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' - _TEST = { + _TESTS = [{ u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', u'file': u'652431.mp4', u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', @@ -19,7 +19,10 @@ class ExtremeTubeIE(InfoExtractor): u"uploader": u"unknown", u"age_limit": 18, } - } + }, { + 'url': 
'http://www.extremetube.com/gay/video/abcde-1234', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index cfeaa4146..1f42c6d3a 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -106,7 +106,7 @@ class OneUPIE(IGNIE): _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' - _TEST = { + _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976', 'md5': '68a54ce4ebc772e4b71e3123d413163d', 'info_dict': { @@ -115,10 +115,7 @@ class OneUPIE(IGNIE): 'title': 'Sniper Elite V2 - Trailer', 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', } - } - - # Override IGN tests - _TESTS = [] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e2cf1ae56..4671f49ed 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -104,7 +104,7 @@ class YahooNewsIE(YahooIE): IE_NAME = 'yahoo:news' _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 'md5': '67010fdf3a08d290e060a4dd96baa07b', 'info_dict': { @@ -113,10 +113,7 @@ class YahooNewsIE(YahooIE): 'title': 'China Moses Is Crazy About the Blues', 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', }, - } - - # Overwrite YahooIE properties we don't want - _TESTS = [] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 3c50b99ab4f39b3b78d5f9d5c20f8d7ce682d0fd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 19:42:51 +0200 Subject: [PATCH 013/184] [extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 30 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py 
b/youtube_dl/extractor/extremetube.py index 7612a0364..d123623c5 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import os import re @@ -8,16 +10,18 @@ from ..utils import ( compat_urllib_parse, ) + class ExtremeTubeIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' _TESTS = [{ - u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - u'file': u'652431.mp4', - u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', - u'info_dict': { - u"title": u"Music Video 14 british euro brit european cumshots swallow", - u"uploader": u"unknown", - u"age_limit": 18, + 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', + 'info_dict': { + 'id': '652431', + 'ext': 'mp4', + 'title': 'Music Video 14 british euro brit european cumshots swallow', + 'uploader': 'unknown', + 'age_limit': 18, } }, { 'url': 'http://www.extremetube.com/gay/video/abcde-1234', @@ -33,11 +37,14 @@ class ExtremeTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') - uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) - video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) + video_title = self._html_search_regex( + r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, 'title') + uploader = self._html_search_regex( + r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', + fatal=False) + video_url = compat_urllib_parse.unquote(self._html_search_regex( + r'video_url=(.+?)&', webpage, 'video_url')) path = 
compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] format = "-".join(format) @@ -46,7 +53,6 @@ class ExtremeTubeIE(InfoExtractor): 'title': video_title, 'uploader': uploader, 'url': video_url, - 'ext': extension, 'format': format, 'format_id': format, 'age_limit': 18, From 3fa6b6e29371325d2ba57fb3dcfd776810bb795a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 19:51:04 +0200 Subject: [PATCH 014/184] [steam] Modernize --- youtube_dl/extractor/steam.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index 91658f892..b3cbbfd77 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -17,22 +19,24 @@ class SteamIE(InfoExtractor): _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' _TEST = { - u"url": u"http://store.steampowered.com/video/105600/", - u"playlist": [ + "url": "http://store.steampowered.com/video/105600/", + "playlist": [ { - u"file": u"81300.flv", - u"md5": u"f870007cee7065d7c76b88f0a45ecc07", - u"info_dict": { - u"title": u"Terraria 1.1 Trailer", - u'playlist_index': 1, + "md5": "f870007cee7065d7c76b88f0a45ecc07", + "info_dict": { + 'id': '81300', + 'ext': 'flv', + "title": "Terraria 1.1 Trailer", + 'playlist_index': 1, } }, { - u"file": u"80859.flv", - u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751", - u"info_dict": { - u"title": u"Terraria Trailer", - u'playlist_index': 2, + "md5": "61aaf31a5c5c3041afb58fb83cbb5751", + "info_dict": { + 'id': '80859', + 'ext': 'flv', + "title": "Terraria Trailer", + 'playlist_index': 2, } } ] @@ -73,7 +77,7 
@@ class SteamIE(InfoExtractor): video_url = vid.group('videoURL') video_thumb = thumb.group('thumbnail') if not video_url: - raise ExtractorError(u'Cannot find video url for %s' % video_id) + raise ExtractorError('Cannot find video url for %s' % video_id) info = { 'id':video_id, 'url':video_url, @@ -82,4 +86,4 @@ class SteamIE(InfoExtractor): 'thumbnail': video_thumb } videos.append(info) - return [self.playlist_result(videos, gameID, game_title)] + return self.playlist_result(videos, gameID, game_title) From 7f9c31df880b02ef1746e2c73f4c1e5aee3da06a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 19 Apr 2014 19:55:53 +0200 Subject: [PATCH 015/184] [steam] Simplify --- youtube_dl/extractor/steam.py | 37 ++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index b3cbbfd77..89ac52e66 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -10,7 +10,7 @@ from ..utils import ( class SteamIE(InfoExtractor): - _VALID_URL = r"""http://store\.steampowered\.com/ + _VALID_URL = r"""(?x)http://store\.steampowered\.com/ (agecheck/)? (?P<urltype>video|app)/ #If the page is only for videos or for a game (?P<gameID>\d+)/? 
@@ -39,15 +39,12 @@ class SteamIE(InfoExtractor): 'playlist_index': 2, } } - ] + ], + 'params': { + 'playlistend': 2, + } } - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') @@ -64,26 +61,26 @@ class SteamIE(InfoExtractor): game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>', webpage, 'game title') - urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\}," - mweb = re.finditer(urlRE, webpage) - namesRE = r'<span class="title">(?P<videoName>.+?)</span>' - titles = re.finditer(namesRE, webpage) - thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">' - thumbs = re.finditer(thumbsRE, webpage) + mweb = re.finditer( + r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", + webpage) + titles = re.finditer( + r'<span class="title">(?P<videoName>.+?)</span>', webpage) + thumbs = re.finditer( + r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage) videos = [] - for vid,vtitle,thumb in zip(mweb,titles,thumbs): + for vid, vtitle, thumb in zip(mweb, titles, thumbs): video_id = vid.group('videoID') title = vtitle.group('videoName') video_url = vid.group('videoURL') video_thumb = thumb.group('thumbnail') if not video_url: raise ExtractorError('Cannot find video url for %s' % video_id) - info = { - 'id':video_id, - 'url':video_url, + videos.append({ + 'id': video_id, + 'url': video_url, 'ext': 'flv', 'title': unescapeHTML(title), 'thumbnail': video_thumb - } - videos.append(info) + }) return self.playlist_result(videos, gameID, game_title) From 0610a3e0b201bd9c58d8983cd96c1d5545134b92 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> 
Date: Sat, 19 Apr 2014 19:57:09 +0200 Subject: [PATCH 016/184] Remove unused imports --- youtube_dl/extractor/dailymotion.py | 1 - youtube_dl/extractor/extremetube.py | 1 - youtube_dl/extractor/teamcoco.py | 3 --- 3 files changed, 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index bae1c7754..c759b9889 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -8,7 +8,6 @@ from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_urllib_request, compat_str, - get_element_by_id, orderedSet, str_to_int, int_or_none, diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index d123623c5..ff7c0cd3e 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import os import re from .common import InfoExtractor diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index dcdadd120..f8dd7e955 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,9 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class TeamcocoIE(InfoExtractor): From 3a9d6790ad1e897d274e4817de5a7aff00e30ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Apr 2014 03:06:50 +0700 Subject: [PATCH 017/184] [ivi] Update playlist tests --- test/test_playlists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 17f1e5fab..02fcde1f7 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -192,8 +192,8 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dezhurnyi_angel') self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012)') - 
self.assertTrue(len(result['entries']) >= 36) - + self.assertTrue(len(result['entries']) >= 23) + def test_ivi_compilation_season(self): dl = FakeYDL() ie = IviCompilationIE(dl) @@ -201,7 +201,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dezhurnyi_angel/season2') self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 2 сезон') - self.assertTrue(len(result['entries']) >= 20) + self.assertTrue(len(result['entries']) >= 7) def test_imdb_list(self): dl = FakeYDL() From 282cb9c7ba918904bfa6f58a5a1bbf44938c76ba Mon Sep 17 00:00:00 2001 From: Kai Weber <kai.weber@brands4friends.de> Date: Sun, 20 Apr 2014 01:01:37 +0200 Subject: [PATCH 018/184] [infoq] Fix extractor --- youtube_dl/extractor/infoq.py | 47 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index ed32373a1..55f7608b5 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -11,16 +11,16 @@ from ..utils import ( class InfoQIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$' + _TEST = { - "name": "InfoQ", - "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", - "file": "12-jan-pythonthings.mp4", - "info_dict": { - "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", - "title": "A Few of My Favorite [Python] Things", - }, - "params": { - "skip_download": True, + u'name': u'InfoQ', + u'url': u'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', + u'md5': u'fcaa3d995e04080dcb9465d86b5eef62', + u'info_dict': { + u'id': u'12-jan-pythonthings', + u'ext': u'mp4', + u'description': u'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', + u'title': 
u'A Few of My Favorite [Python] Things', }, } @@ -30,26 +30,25 @@ class InfoQIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - # Extract video URL - encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') - real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) - video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id + self.report_extraction(video_id) - # Extract title - video_title = self._search_regex(r'contentTitle = "(.*?)";', - webpage, 'title') + video_title = self._html_search_regex(r'<title>(.*?)', webpage, 'title') + video_description = self._html_search_meta('description', webpage, 'description') - # Extract description - video_description = self._html_search_regex(r'', - webpage, 'description', fatal=False) + video_url = 'rtmpe://video.infoq.com/cfx/st/' + base64playpath = self._search_regex(r"jsclassref = '([^']*)'", webpage, 'jsclassref') + playpath = 'mp4:' + base64.b64decode(base64playpath).decode('utf-8') - video_filename = video_url.split('/')[-1] + video_filename = playpath.split('/')[-1] video_id, extension = video_filename.split('.') - return { + return [{ 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': extension, # Extension is always(?) 
mp4, but seems to be flv 'description': video_description, - } + 'formats': [{ + 'url': video_url, + 'ext': extension, + 'play_path': playpath, + }], + }] From 7560096db5ce0002b3cf3f3bcbbb53ef05e6fb13 Mon Sep 17 00:00:00 2001 From: Kai Weber Date: Sun, 20 Apr 2014 01:10:30 +0200 Subject: [PATCH 019/184] [infoq] Simplify playpath calculation --- youtube_dl/extractor/infoq.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 55f7608b5..65f0b76cb 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -35,9 +35,13 @@ class InfoQIE(InfoExtractor): video_title = self._html_search_regex(r'(.*?)', webpage, 'title') video_description = self._html_search_meta('description', webpage, 'description') + # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' - base64playpath = self._search_regex(r"jsclassref = '([^']*)'", webpage, 'jsclassref') - playpath = 'mp4:' + base64.b64decode(base64playpath).decode('utf-8') + + # Extract video URL + encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') + real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) + playpath = 'mp4:' + real_id video_filename = playpath.split('/')[-1] video_id, extension = video_filename.split('.') From 1f27d2c0e113cca4e38bb63ba5bf173a01044fd2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 02:34:34 +0200 Subject: [PATCH 020/184] [steam] Add support for steamcommunity.com (Fixes #2757) --- youtube_dl/extractor/steam.py | 119 ++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index 89ac52e66..1d8d57224 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -10,15 +10,18 @@ from ..utils import ( class SteamIE(InfoExtractor): - _VALID_URL = 
r"""(?x)http://store\.steampowered\.com/ - (agecheck/)? - (?Pvideo|app)/ #If the page is only for videos or for a game - (?P\d+)/? - (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID - """ + _VALID_URL = r"""(?x) + https?://store\.steampowered\.com/ + (agecheck/)? + (?Pvideo|app)/ #If the page is only for videos or for a game + (?P\d+)/? + (?P\d*)(?P\??) # For urltype == video we sometimes get the videoID + | + https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P[0-9]+) + """ _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' - _TEST = { + _TESTS = [{ "url": "http://store.steampowered.com/video/105600/", "playlist": [ { @@ -43,44 +46,78 @@ class SteamIE(InfoExtractor): 'params': { 'playlistend': 2, } - } + }, { + 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205', + 'info_dict': { + 'id': 'WB5DvDOOvAY', + 'ext': 'mp4', + 'upload_date': '20140329', + 'title': 'FRONTIERS - Final Greenlight Trailer', + 'description': "The final trailer for the Steam Greenlight launch. Hooray, progress! Here's the official Greenlight page: http://steamcommunity.com/sharedfiles/filedetails/?id=242472205", + 'uploader': 'AAD Productions', + 'uploader_id': 'AtomicAgeDogGames', + } + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url, re.VERBOSE) - gameID = m.group('gameID') - - videourl = self._VIDEO_PAGE_TEMPLATE % gameID - webpage = self._download_webpage(videourl, gameID) + m = re.match(self._VALID_URL, url) + fileID = m.group('fileID') + if fileID: + videourl = url + playlist_id = fileID + else: + gameID = m.group('gameID') + playlist_id = gameID + videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id + webpage = self._download_webpage(videourl, playlist_id) if re.search('

Please enter your birth date to continue:

', webpage) is not None: - videourl = self._AGECHECK_TEMPLATE % gameID + videourl = self._AGECHECK_TEMPLATE % playlist_id self.report_age_confirmation() - webpage = self._download_webpage(videourl, gameID) + webpage = self._download_webpage(videourl, playlist_id) - self.report_extraction(gameID) - game_title = self._html_search_regex(r'', - webpage, 'game title') + if fileID: + playlist_title = self._html_search_regex( + r'
(.+)
', webpage, 'title') + mweb = re.finditer(r'''(?x) + 'movie_(?P[0-9]+)':\s*\{\s* + YOUTUBE_VIDEO_ID:\s*"(?P[^"]+)", + ''', webpage) + videos = [{ + '_type': 'url', + 'url': vid.group('youtube_id'), + 'ie_key': 'Youtube', + } for vid in mweb] + else: + playlist_title = self._html_search_regex( + r'', webpage, 'game title') - mweb = re.finditer( - r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\},", - webpage) - titles = re.finditer( - r'(?P.+?)', webpage) - thumbs = re.finditer( - r'', webpage) - videos = [] - for vid, vtitle, thumb in zip(mweb, titles, thumbs): - video_id = vid.group('videoID') - title = vtitle.group('videoName') - video_url = vid.group('videoURL') - video_thumb = thumb.group('thumbnail') - if not video_url: - raise ExtractorError('Cannot find video url for %s' % video_id) - videos.append({ - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - 'title': unescapeHTML(title), - 'thumbnail': video_thumb - }) - return self.playlist_result(videos, gameID, game_title) + mweb = re.finditer(r'''(?x) + 'movie_(?P[0-9]+)':\s*\{\s* + FILENAME:\s*"(?P[\w:/\.\?=]+)" + (,\s*MOVIE_NAME:\s*\"(?P[\w:/\.\?=\+-]+)\")?\s*\}, + ''', webpage) + titles = re.finditer( + r'(?P.+?)', webpage) + thumbs = re.finditer( + r'', webpage) + videos = [] + + for vid, vtitle, thumb in zip(mweb, titles, thumbs): + video_id = vid.group('videoID') + title = vtitle.group('videoName') + video_url = vid.group('videoURL') + video_thumb = thumb.group('thumbnail') + if not video_url: + raise ExtractorError('Cannot find video url for %s' % video_id) + videos.append({ + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': unescapeHTML(title), + 'thumbnail': video_thumb + }) + if not videos: + raise ExtractorError('Could not find any videos') + + return self.playlist_result(videos, playlist_id, playlist_title) From a40e0dd434da375eb3f2642e927ac93b8575ed2d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 
02:34:53 +0200 Subject: [PATCH 021/184] release 2014.04.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b20bb8753..a1bbf4176 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.19' +__version__ = '2014.04.21' From c0a7c608150bfdb36a4916988fc47375db621d59 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 02:55:35 +0200 Subject: [PATCH 022/184] [infoq] Simplify (#2777) --- youtube_dl/extractor/infoq.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 65f0b76cb..15cf8ad01 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -13,14 +13,14 @@ class InfoQIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P[^/]+)$' _TEST = { - u'name': u'InfoQ', - u'url': u'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', - u'md5': u'fcaa3d995e04080dcb9465d86b5eef62', - u'info_dict': { - u'id': u'12-jan-pythonthings', - u'ext': u'mp4', - u'description': u'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', - u'title': u'A Few of My Favorite [Python] Things', + 'name': 'InfoQ', + 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', + 'md5': 'fcaa3d995e04080dcb9465d86b5eef62', + 'info_dict': { + 'id': '12-jan-pythonthings', + 'ext': 'mp4', + 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', + 'title': 'A Few of My Favorite [Python] Things', }, } @@ -30,8 +30,6 @@ class InfoQIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - self.report_extraction(video_id) - video_title = 
self._html_search_regex(r'(.*?)', webpage, 'title') video_description = self._html_search_meta('description', webpage, 'description') @@ -46,7 +44,7 @@ class InfoQIE(InfoExtractor): video_filename = playpath.split('/')[-1] video_id, extension = video_filename.split('.') - return [{ + return { 'id': video_id, 'title': video_title, 'description': video_description, @@ -55,4 +53,4 @@ class InfoQIE(InfoExtractor): 'ext': extension, 'play_path': playpath, }], - }] + } From edec83a02579007da0f1043f7340ff2fe252a84b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 03:21:34 +0200 Subject: [PATCH 023/184] [infoq] Add support for HTTP downloads (Fixes #722) --- youtube_dl/extractor/infoq.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 15cf8ad01..e76dd222d 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -13,9 +13,8 @@ class InfoQIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P[^/]+)$' _TEST = { - 'name': 'InfoQ', 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', - 'md5': 'fcaa3d995e04080dcb9465d86b5eef62', + 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', 'info_dict': { 'id': '12-jan-pythonthings', 'ext': 'mp4', @@ -37,20 +36,32 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' # Extract video URL - encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id') + encoded_id = self._search_regex( + r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id') real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) playpath = 'mp4:' + real_id video_filename = playpath.split('/')[-1] video_id, extension = video_filename.split('.') + http_base = self._search_regex( + r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage, + 'HTTP base URL') + + formats = [{ + 'format_id': 
'rtmp', + 'url': video_url, + 'ext': extension, + 'play_path': playpath, + }, { + 'format_id': 'http', + 'url': http_base + real_id, + }] + self._sort_formats(formats) + return { 'id': video_id, 'title': video_title, 'description': video_description, - 'formats': [{ - 'url': video_url, - 'ext': extension, - 'play_path': playpath, - }], + 'formats': formats, } From d1b9c912a42de3b99ae73553d38fbfa50b8ebc52 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 04:59:44 +0200 Subject: [PATCH 024/184] [utils] Fix _windows_write_string (Fixes #2779) It turns out that the function did not work for outputs longer than 1024 UCS-2 tokens. Write non-BMP characters one by one to ensure that we count correctly. --- youtube_dl/utils.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c9320934..116eb3610 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -923,9 +923,6 @@ def _windows_write_string(s, out): 2: -12, } - def ucs2_len(s): - return sum((2 if ord(c) > 0xffff else 1) for c in s) - fileno = out.fileno() if fileno not in WIN_OUTPUT_IDS: return False @@ -959,13 +956,25 @@ def _windows_write_string(s, out): if not_a_console(h): return False - remaining = ucs2_len(s) - while remaining > 0: + def next_nonbmp_pos(s): + try: + return next(i for i, c in enumerate(s) if ord(c) > 0xffff) + except StopIteration: + return len(s) + + while s: + count = min(next_nonbmp_pos(s), 1024) + ret = WriteConsoleW( - h, s, min(remaining, 1024), ctypes.byref(written), None) + h, s, count if count else 2, ctypes.byref(written), None) if ret == 0: raise OSError('Failed to write string') - remaining -= written.value + if not count: # We just wrote a non-BMP character + assert written.value == 2 + s = s[1:] + else: + assert written.value > 0 + s = s[written.value:] return True From fa35cdad02e1c40094f01c9f8e6529da2f021563 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: 
Mon, 21 Apr 2014 05:47:52 +0200 Subject: [PATCH 025/184] [condenast|generic] Add support for condenast embeds (Fixes #2783) --- youtube_dl/extractor/condenast.py | 30 ++++++++++++++++++------------ youtube_dl/extractor/generic.py | 26 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 91c1c1348..ffbe4903b 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -28,16 +28,18 @@ class CondeNastIE(InfoExtractor): 'glamour': 'Glamour', 'wmagazine': 'W Magazine', 'vanityfair': 'Vanity Fair', + 'cnevids': 'Condé Nast', } - _VALID_URL = r'http://(video|www)\.(?P%s)\.com/(?Pwatch|series|video)/(?P.+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'http://(video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) _TEST = { 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', - 'file': '5171b343c2b4c00dd0c1ccb3.mp4', 'md5': '1921f713ed48aabd715691f774c451f7', 'info_dict': { + 'id': '5171b343c2b4c00dd0c1ccb3', + 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', } @@ -55,12 +57,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage): - description = self._html_search_regex([r'
(.+?)
', - r'
(.+?)
', - ], - webpage, 'description', - fatal=False, flags=re.DOTALL) + def _extract_video(self, webpage, url_type): + if url_type != 'embed': + description = self._html_search_regex( + [ + r'
(.+?)
', + r'
(.+?)
', + ], + webpage, 'description', fatal=False, flags=re.DOTALL) + else: + description = None params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, 'player params', flags=re.DOTALL) video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') @@ -99,12 +105,12 @@ class CondeNastIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site = mobj.group('site') url_type = mobj.group('type') - id = mobj.group('id') + item_id = mobj.group('id') - self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) - webpage = self._download_webpage(url, id) + self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) + webpage = self._download_webpage(url, item_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage) + return self._extract_video(webpage, url_type) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f9b9d56d2..4b14cc5bf 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -239,6 +239,16 @@ class GenericIE(InfoExtractor): 'uploader_id': 'rbctv_2012_4', }, }, + # Condé Nast embed + { + 'url': 'http://www.wired.com/2014/04/honda-asimo/', + 'md5': 'ba0dfe966fa007657bd1443ee672db0f', + 'info_dict': { + 'id': '53501be369702d3275860000', + 'ext': 'mp4', + 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', + } + } ] def report_download_webpage(self, video_id): @@ -485,6 +495,22 @@ class GenericIE(InfoExtractor): if mobj: return self.url_result(mobj.group(1), 'BlipTV') + # Look for embedded condenast player + matches = re.findall( + r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: From a4eb9578af3ef0c0b4a3f73020743e1efe3c6c09 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 06:18:04 +0200 Subject: [PATCH 026/184] [yahoo] Add support for movies (Fixes #2780) --- youtube_dl/extractor/yahoo.py | 35 
++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4671f49ed..393f6ffbe 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -14,8 +14,8 @@ from ..utils import ( class YahooIE(InfoExtractor): - IE_DESC = 'Yahoo screen' - _VALID_URL = r'https?://screen\.yahoo\.com/.*?-(?P[0-9]+)(?:-[a-z]+)?\.html' + IE_DESC = 'Yahoo screen and movies' + _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P[0-9]+)(?:-[a-z]+)?\.html' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', @@ -37,6 +37,16 @@ class YahooIE(InfoExtractor): 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', }, }, + { + 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', + 'md5': '410b7104aa9893b765bc22787a22f3d9', + 'info_dict': { + 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', + 'ext': 'mp4', + 'title': 'The World Loves Spider-Man', + 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', + } + } ] def _real_extract(self, url): @@ -44,13 +54,20 @@ class YahooIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - items_json = self._search_regex(r'mediaItems: ({.*?})$', - webpage, 'items', flags=re.MULTILINE) - items = json.loads(items_json) - info = items['mediaItems']['query']['results']['mediaObj'][0] - # The 'meta' field is not always in the video webpage, we request it - # from another page - long_id = info['id'] + items_json = self._search_regex( + r'mediaItems: ({.*?})$', webpage, 'items', 
flags=re.MULTILINE, + default=None) + if items_json is None: + long_id = self._search_regex( + r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', + webpage, 'content ID') + video_id = long_id + else: + items = json.loads(items_json) + info = items['mediaItems']['query']['results']['mediaObj'][0] + # The 'meta' field is not always in the video webpage, we request it + # from another page + long_id = info['id'] return self._get_info(long_id, video_id) def _get_info(self, long_id, video_id): From e8f2025edf789647dc8569a69f05c8e1b54f46da Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 06:25:21 +0200 Subject: [PATCH 027/184] [mdr] Add support for modern URLs (Fixes #2775) --- youtube_dl/extractor/mdr.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 7aa0080d7..271dccf53 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,9 +9,13 @@ from ..utils import ( class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?Pvideo|audio)(?P[^/_]+)_.*' + _VALID_URL = r'^(?Phttps?://(?:www\.)?mdr\.de)/(?:.*)/(?Pvideo|audio)(?P[^/_]+)(?:_|\.html)' # No tests, MDR regularily deletes its videos + _TEST = { + 'url': 'http://www.mdr.de/fakt/video189002.html', + 'only_matching': True, + } def _real_extract(self, url): m = re.match(self._VALID_URL, url) @@ -19,9 +25,9 @@ class MDRIE(InfoExtractor): # determine title and media streams from webpage html = self._download_webpage(url, video_id) - title = self._html_search_regex(r'

(.*?)

', html, u'title') + title = self._html_search_regex(r'(.*?)', html, 'title') xmlurl = self._search_regex( - r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL') + r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') doc = self._download_xml(domain + xmlurl, video_id) formats = [] @@ -41,7 +47,7 @@ class MDRIE(InfoExtractor): if vbr_el is None: format.update({ 'vcodec': 'none', - 'format_id': u'%s-%d' % (media_type, abr), + 'format_id': '%s-%d' % (media_type, abr), }) else: vbr = int(vbr_el.text) // 1000 @@ -49,12 +55,9 @@ class MDRIE(InfoExtractor): 'vbr': vbr, 'width': int(a.find('frameWidth').text), 'height': int(a.find('frameHeight').text), - 'format_id': u'%s-%d' % (media_type, vbr), + 'format_id': '%s-%d' % (media_type, vbr), }) formats.append(format) - if not formats: - raise ExtractorError(u'Could not find any valid formats') - self._sort_formats(formats) return { From 43acb120f36a2eb8db88daedb7e451ca90622252 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 06:28:25 +0200 Subject: [PATCH 028/184] release 2014.04.21.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a1bbf4176..df8ac0284 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.21' +__version__ = '2014.04.21.1' From d2d6481afb064b6d809e3b131bd7365e2aeee1e1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 06:49:21 +0200 Subject: [PATCH 029/184] [mdr] Remove unused imports --- youtube_dl/extractor/mdr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 271dccf53..1b8c4a32e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -3,9 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class 
MDRIE(InfoExtractor): From 478c2c619364f5fb0c1ee9e9489048ab4ae26521 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 07:12:02 +0200 Subject: [PATCH 030/184] [clubic] Add extractor (Fixes #2773) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clubic.py | 58 ++++++++++++++++++++++++++++++++ youtube_dl/utils.py | 11 ++++++ 3 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/clubic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e0ef1cd3e..8235d42b6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -40,6 +40,7 @@ from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .clubic import ClubicIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py new file mode 100644 index 000000000..14f215c5c --- /dev/null +++ b/youtube_dl/extractor/clubic.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + qualities, +) + + +class ClubicIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P[0-9]+)\.html' + + _TEST = { + 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', + 'md5': '1592b694ba586036efac1776b0b43cd3', + 'info_dict': { + 'id': '448474', + 'ext': 'mp4', + 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', + 'description': 're:Gueule de bois chez Nokia. 
Le constructeur a indiqué cette.*', + 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_page = self._download_webpage(player_url, video_id) + + config_json = self._search_regex( + r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, + 'configuration') + config = json.loads(config_json) + + video_info = config['videoInfo'] + sources = config['sources'] + quality_order = qualities(['sd', 'hq']) + + formats = [{ + 'format_id': src['streamQuality'], + 'url': src['src'], + 'quality': quality_order(src['streamQuality']), + } for src in sources] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'description': clean_html(video_info.get('description')), + 'thumbnail': config.get('poster'), + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 116eb3610..d4df78071 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1406,3 +1406,14 @@ US_RATINGS = { def strip_jsonp(code): return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code) + + +def qualities(quality_ids): + """ Get a numeric quality value out of a list of possible values """ + def q(qid): + try: + return quality_ids.index(qid) + except ValueError: + return -1 + return q + From 4086f119292ab1d5deab38ece163322e1011eba0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 07:12:12 +0200 Subject: [PATCH 031/184] release 2014.04.21.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index df8ac0284..fac28e134 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.04.21.1' +__version__ = '2014.04.21.2' From b9ba5dfa28baa4541016b49eadf74d731dc8936c Mon Sep 17 00:00:00 2001 
From: Philipp Hagemeister Date: Mon, 21 Apr 2014 07:56:51 +0200 Subject: [PATCH 032/184] [test helper] Correct only_matching test gathering --- test/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index 09873aea3..da714078d 100644 --- a/test/helper.py +++ b/test/helper.py @@ -85,7 +85,7 @@ def gettestcases(include_onlymatching=False): else: tests = getattr(ie, '_TESTS', []) for t in tests: - if not include_onlymatching and getattr(t, 'only_matching', False): + if not include_onlymatching and t.get('only_matching', False): continue t['name'] = type(ie).__name__[:-len('IE')] yield t From 88ce273da4d6a870903d2551d1e1451c08febb01 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 07:59:16 +0200 Subject: [PATCH 033/184] [arte] differentiate JSON outputs --- youtube_dl/extractor/arte.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 646377e4b..2abdd5029 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -74,7 +74,8 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + json_url = self._html_search_regex( + r'arte_vp_url="(.*?)"', webpage, 'json vp url') return self._extract_from_json_url(json_url, video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): From e51880fd32c2087885b203f7dffd48c01b68d99a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 07:59:29 +0200 Subject: [PATCH 034/184] [cnet] Correct JSON capturing --- youtube_dl/extractor/cnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index f5ab443d2..a94f42571 100644 --- a/youtube_dl/extractor/cnet.py +++ 
b/youtube_dl/extractor/cnet.py @@ -33,7 +33,7 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"
Date: Mon, 21 Apr 2014 12:34:32 +0200 Subject: [PATCH 035/184] [ted] Remove superfluous u prefixes --- youtube_dl/extractor/ted.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index df569a876..5b24716d9 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -97,7 +97,7 @@ class TEDIE(SubtitlesInfoExtractor): playlist_info = info['playlist'] playlist_entries = [ - self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key()) + self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) for talk in info['talks'] ] return self.playlist_result( @@ -163,7 +163,7 @@ class TEDIE(SubtitlesInfoExtractor): sub_lang_list[l] = url return sub_lang_list else: - self._downloader.report_warning(u'video doesn\'t have subtitles') + self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): From 621f33c9d00a82c2d119046917a8bb628dc38067 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 12:37:16 +0200 Subject: [PATCH 036/184] [ted] Extend search for description --- youtube_dl/extractor/ted.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 5b24716d9..abe1c34d3 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -178,7 +178,10 @@ class TEDIE(SubtitlesInfoExtractor): title = self._html_search_regex( r"(?s)(.+?)", webpage, 'title') description = self._html_search_regex( - r'(?s)

.*?

(.*?)
', + [ + r'(?s)

.*?

(.*?)', + r'(?s)

About this talk:\s+(.*?)

', + ], webpage, 'description', fatal=False) return { From 48099643cccebbff39550808a5d2e22b492b10af Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 21 Apr 2014 12:37:41 +0200 Subject: [PATCH 037/184] [generic] Be more relaxed when looking for aparat embeds (Fixes #2784) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4b14cc5bf..3c9f98dc3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -531,7 +531,7 @@ class GenericIE(InfoExtractor): return OoyalaIE._build_url_result(mobj.group('ec')) # Look for Aparat videos - mobj = re.search(r'