From 3f64379eda3477306df013466045ab1a711533f4 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 16:22:06 +0100 Subject: [PATCH 01/30] [movieclips] fix extraction --- youtube_dl/extractor/movieclips.py | 43 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 1564cb71f..d0cb8278e 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -2,39 +2,48 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + smuggle_url, + float_or_none, + parse_iso8601, + update_url_query, +) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/.+-(?P\d+)(?:\?|$)' _TEST = { - 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', + 'md5': '42b5a0352d4933a7bd54f2104f481244', 'info_dict': { 'id': 'pKIGmG83AqD9', - 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', 'title': 'Warcraft Trailer 1', 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1446843055, + 'upload_date': '20151106', + 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - display_id = self._match_id(url) - - req = sanitized_Request(url) - # it doesn't work if it thinks the browser it's too old - req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') - webpage = self._download_webpage(req, display_id) - theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') - title = self._html_search_regex(r']*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') - description = self._html_search_meta('description', webpage) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video = next(v for v in self._parse_json(self._search_regex( + r'var\s+__REACT_ENGINE__\s*=\s*({.+});', + webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id) return { '_type': 'url_transparent', - 'url': theplatform_link, - 'title': title, - 'display_id': display_id, - 'description': description, + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}), + 'title': self._og_search_title(webpage), + 'description': self._html_search_meta('description', webpage), + 'duration': float_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateCreated')), + 'thumbnail': video.get('defaultImage'), + 'uploader': video.get('provider'), } From 03caa463e73c2ae2f666b85febf25ddb03f961ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:38:56 +0600 Subject: [PATCH 02/30] [udemy:course] Skip non-video lectures --- youtube_dl/extractor/udemy.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a788cdd77..bc69e6e41 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -193,12 +193,12 @@ class UdemyIE(InfoExtractor): asset = lecture['asset'] - asset_type = asset.get('assetType') or asset.get('asset_type') + asset_type = asset.get('asset_type') or asset.get('assetType') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - stream_url = asset.get('streamUrl') or asset.get('stream_url') + stream_url = asset.get('stream_url') or asset.get('streamUrl') if stream_url: youtube_url = self._search_regex( r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) @@ -206,7 +206,7 @@ class UdemyIE(InfoExtractor): return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] - thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') + thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) formats = [] @@ -325,7 +325,7 @@ class UdemyCourseIE(UdemyIE): 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, course_id, 'Downloading course curriculum', query={ 'fields[chapter]': 'title,object_index', - 'fields[lecture]': 'title', + 'fields[lecture]': 'title,asset', 'page_size': '1000', }) @@ -334,6 +334,11 @@ class UdemyCourseIE(UdemyIE): for entry in response['results']: clazz = entry.get('_class') if clazz == 'lecture': + asset = entry.get('asset') + if isinstance(asset, dict): + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + continue lecture_id = entry.get('id') if lecture_id: entry = { From a3373823e1bd0239e0f58d5dd16ef5a4ec6bceb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:42:09 +0600 Subject: [PATCH 03/30] [udemy] Remove unnecessary login/password encode This is now covered by compat_urllib_parse_urlencode --- youtube_dl/extractor/udemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bc69e6e41..d1e6f2703 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -151,8 +151,8 @@ class UdemyIE(InfoExtractor): login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ - 'email': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'email': username, + 'password': password, }) request = sanitized_Request( From fbdaced256f9d7d9b0adb97d093f0f381c9483f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:45:20 +0600 Subject: [PATCH 04/30] [lynda] Remove unnecessary login/password encode --- youtube_dl/extractor/lynda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 655627479..86d47266f 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -28,8 +28,8 @@ class LyndaBaseIE(InfoExtractor): return login_form = { - 'username': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'username': username, + 'password': password, 'remember': 'false', 'stayPut': 'false' } From 244cd04237fe4a1e4d92421711f41de3c2566d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:46:46 +0600 Subject: [PATCH 05/30] [pluralsight] Remove unnecessary login/password encode --- youtube_dl/extractor/pluralsight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index df03dd419..9aab77645 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -64,8 +64,8 @@ class PluralsightIE(PluralsightBaseIE): login_form = self._hidden_inputs(login_page) login_form.update({ - 'Username': username.encode('utf-8'), - 'Password': password.encode('utf-8'), + 'Username': username, + 'Password': password, }) post_url = self._search_regex( From 83cedc1cf224206adf513f5bdd5f5ce915d67933 Mon Sep 17 00:00:00 2001 From: Martin Trigaux Date: Tue, 29 Mar 2016 14:18:44 +0200 Subject: [PATCH 06/30] screencast.com: support missing www The "www." part of the URL is not mandatory --- youtube_dl/extractor/screencast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index dfd897ba3..d5111c629 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -12,7 +12,7 @@ from ..utils import ( class ScreencastIE(InfoExtractor): - _VALID_URL = r'https?://www\.screencast\.com/t/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ -34,7 +34,7 @@ class ScreencastIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } }, { - 'url': 'http://www.screencast.com/t/aAB3iowa', + 'url': 'http://screencast.com/t/aAB3iowa', 'md5': 'dedb2734ed00c9755761ccaee88527cd', 'info_dict': { 'id': 'aAB3iowa', From 81de73e5b43e5009a14f569aed92fe73e61d4f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:00:45 +0600 Subject: [PATCH 07/30] [screencast] Add test --- youtube_dl/extractor/screencast.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index d5111c629..c69451151 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -34,7 +34,7 @@ class ScreencastIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } }, { - 'url': 'http://screencast.com/t/aAB3iowa', + 'url': 'http://www.screencast.com/t/aAB3iowa', 'md5': 'dedb2734ed00c9755761ccaee88527cd', 'info_dict': { 'id': 'aAB3iowa', @@ -53,8 +53,10 @@ class ScreencastIE(InfoExtractor): 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } - }, - ] + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 791d6aaeccd2efae2c4c5fa1e72010be85eb89b8 Mon Sep 17 00:00:00 2001 From: Martin Trigaux Date: Tue, 29 Mar 2016 14:34:58 +0200 Subject: [PATCH 08/30] screencast.com: fallback on page title When determining the title of the page, use the tag of the page --- youtube_dl/extractor/screencast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index c69451151..32f31fdd7 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -97,7 +97,8 @@ class ScreencastIE(InfoExtractor): if title is None: title = self._html_search_regex( [r'<b>Title:</b> ([^<]*)</div>', - r'class="tabSeperator">></span><span class="tabText">(.*?)<'], + r'class="tabSeperator">></span><span class="tabText">(.*?)<', + r'<title>([^<]*)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) From 75d572e9fb8d3e26e4ab45e65cd5e23c6b1c6915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:01:55 +0600 Subject: [PATCH 09/30] [screencast] Improve title regexes (Closes #9025) --- youtube_dl/extractor/screencast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 32f31fdd7..356631700 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -96,9 +96,9 @@ class ScreencastIE(InfoExtractor): title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( - [r'Title: ([^<]*)', - r'class="tabSeperator">>(.*?)<', - r'([^<]*)'], + [r'Title: ([^<]+)', + r'class="tabSeperator">>(.+?)<', + r'([^<]+)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) From 79ba9140dc8fcf5883b7473596e8f20cba6b479f Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 18:06:11 +0100 Subject: [PATCH 10/30] [theplatform] extract timestamp and uploader --- youtube_dl/extractor/aenetworks.py | 6 ++++++ youtube_dl/extractor/bravotv.py | 3 +++ youtube_dl/extractor/cbs.py | 3 +++ youtube_dl/extractor/cnbc.py | 3 +++ youtube_dl/extractor/fox.py | 3 +++ youtube_dl/extractor/nationalgeographic.py | 12 ++++++++++++ youtube_dl/extractor/nbc.py | 14 ++++++++++++++ youtube_dl/extractor/sbs.py | 6 +++++- youtube_dl/extractor/theplatform.py | 10 ++++++++++ 9 files changed, 59 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index b7232c904..3fddaba54 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -22,6 +22,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -37,6 +40,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', }, 'add_ie': ['ThePlatform'], }, { diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 34d451f38..541c76944 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -15,6 +15,9 @@ class BravoTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Last Chance Kitchen Returns', 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + 'timestamp': 1448926740, + 'upload_date': '20151130', + 'uploader': 'NBCU-BRAV', } } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 6e4079ca3..c621a08d5 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -33,6 +33,9 @@ class CBSIE(CBSBaseIE): 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 25b308752..d354d9f95 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -14,6 +14,9 @@ class CNBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fighting zombies is big business', 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index fa05af50d..95c1abf94 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -16,6 +16,9 @@ class FOXIE(InfoExtractor): 'title': 'Official Trailer: Gotham', 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', 'duration': 129, + 'timestamp': 1400020798, + 'upload_date': '20140513', + 'uploader': 'NEWA-FNG-FOXCOM', }, 'add_ie': ['ThePlatform'], } diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 61b5c700e..722518663 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -21,6 +21,9 @@ class NationalGeographicIE(InfoExtractor): 'ext': 'mp4', 'title': 'Mating Crabs Busted by Sharks', 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', + 'timestamp': 1423523799, + 'upload_date': '20150209', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, @@ -32,6 +35,9 @@ class NationalGeographicIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Real Jaws', 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6', + 'timestamp': 1433772632, + 'upload_date': '20150608', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, @@ -68,6 +74,9 @@ class NationalGeographicChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'Uncovering a Universal Knowledge', 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', + 'timestamp': 1458680907, + 'upload_date': '20160322', + 'uploader': 'NEWA-FNG-NGTV', }, 'add_ie': ['ThePlatform'], }, @@ -79,6 +88,9 @@ class NationalGeographicChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Stunning Red Bird of Paradise', 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', + 'timestamp': 1459362152, + 'upload_date': '20160330', + 'uploader': 'NEWA-FNG-NGTV', }, 'add_ie': ['ThePlatform'], }, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 43d75d3ca..e67025ff6 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -27,6 +27,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', + 'timestamp': 1424246400, + 'upload_date': '20150218', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -50,6 +53,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Star Wars Teaser', 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', + 'timestamp': 1417852800, + 'upload_date': '20141206', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -78,6 +84,7 @@ class NBCIE(InfoExtractor): theplatform_url = 'http:' + theplatform_url return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'url': smuggle_url(theplatform_url, {'source_url': url}), 'id': video_id, } @@ -93,6 +100,9 @@ class NBCSportsVPlayerIE(InfoExtractor): 'ext': 'flv', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + 'timestamp': 1426270238, + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', } }, { 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', @@ -144,6 +154,9 @@ class CSNNEIE(InfoExtractor): 'ext': 'mp4', 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', + 'timestamp': 1459369979, + 'upload_date': '20160330', + 'uploader': 'NBCU-SPORTS', } } @@ -331,6 +344,7 @@ class MSNBCIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 2f96477ca..96472fbc4 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -24,6 +24,9 @@ class SBSIE(InfoExtractor): 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': 're:http://.*\.jpg', 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', }, }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', @@ -57,6 +60,7 @@ class SBSIE(InfoExtractor): return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'id': video_id, - 'url': smuggle_url(theplatform_url, {'force_smil_url': True}), + 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index bf6f82f5a..6da701a39 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -76,6 +76,8 @@ class ThePlatformBaseIE(OnceIE): 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], 'duration': int_or_none(info.get('duration'), 1000), + 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, + 'uploader': info.get('billingCode'), } @@ -94,6 +96,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'title': 'Blackberry\'s big, bold Z30', 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', 'duration': 247, + 'timestamp': 1383239700, + 'upload_date': '20131031', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -107,6 +112,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'flv', 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', 'title': 'Tesla Model S: A second step towards a cleaner motoring future', + 'timestamp': 1426176191, + 'upload_date': '20150312', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -119,6 +127,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'mp4', 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + 'uploader': 'EGSM', } }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', @@ -135,6 +144,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', + 'uploader': 'NBCU-NEWS', }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 From 04819db58edfa7a169e7ba6fd2d5734500754571 Mon Sep 17 00:00:00 2001 From: theGeekPirate Date: Sat, 26 Mar 2016 05:37:40 -0700 Subject: [PATCH 11/30] [camwithher] Add extractor Corrected unnecessary test Sane variable naming RTMP all .flv & url_id for _download_webpage() Corrected all outstanding issues, next up is a squash! --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/camwithher.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/camwithher.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19f802411..438e1cc63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -95,6 +95,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py new file mode 100644 index 000000000..eb0a4ec56 --- /dev/null +++ b/youtube_dl/extractor/camwithher.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*viewkey=(?P\w+)' + + _TESTS = [ + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + }, + 'params': { + 'skip_download': True, + } + }, + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, + { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex(r' 2010 else flv_id) + + title = self._html_search_regex(r'
\s+

(.+?)

', webpage, 'title') + + return { + 'id': flv_id, + 'url': rtmp_url, + 'no_resume': True, + 'ext': 'flv', + 'title': title, + } From 9aaaf8e8e8ae12ed8fbc62461558a4cdb8640ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:47:27 +0600 Subject: [PATCH 12/30] [camwithher] Improve extraction (Closes #8989) --- youtube_dl/extractor/camwithher.py | 95 ++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index eb0a4ec56..9809096ec 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -1,55 +1,88 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) class CamWithHerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*viewkey=(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P\w+)' - _TESTS = [ - { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', - 'info_dict': { - 'id': '5644', - 'ext': 'flv', - 'title': 'Periscope Tease', - }, - 'params': { - 'skip_download': True, - } + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', }, - { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', - 'only_matching': True, - }, - { - 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', - 'only_matching': True, - }, - { - 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', - 'only_matching': True, + 'params': { + 'skip_download': True, } - ] + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - flv_id = self._html_search_regex(r'
2010 else flv_id) + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r']+style="float:left"[^>]*>\s*

(.+?)

', webpage, 'title') + description = self._html_search_regex( + r'>Description:(.+?)
', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*]+>([^<]+)', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - title = self._html_search_regex(r'
\s+

(.+?)

', webpage, 'title') return { 'id': flv_id, 'url': rtmp_url, - 'no_resume': True, 'ext': 'flv', + 'no_resume': True, 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, } From 329c1eae54bf71ae8602f79f71570eaf90ef7d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:42:19 +0200 Subject: [PATCH 13/30] [aenetworks] Make pep8 happy --- youtube_dl/extractor/aenetworks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 3fddaba54..1bbfe2641 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -75,8 +75,9 @@ class AENetworksIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, fatal=False) info.update({ '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - video_url, query), { + 'url': smuggle_url( + update_url_query(video_url, query), + { 'sig': { 'key': 'crazyjava', 'secret': 's3cr3t'}, From 0f28777f58b5c21226d8e02477834dbb08b170ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:43:14 +0200 Subject: [PATCH 14/30] [cbsnews] Remove unused import --- youtube_dl/extractor/cbsnews.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index b5e78a65d..79ddc20a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from .cbs import CBSBaseIE from ..utils import ( parse_duration, - find_xpath_attr, ) From 6d628fafcadf6b9d2bc16c34c8cda8b53860e406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:45:21 +0200 Subject: [PATCH 15/30] [camwithher] Remove extra blank line --- youtube_dl/extractor/camwithher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index 9809096ec..afbc5ea26 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -72,7 +72,6 @@ class CamWithHerIE(InfoExtractor): upload_date = unified_strdate(self._search_regex( r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - return { 'id': flv_id, 'url': rtmp_url, From df634be2ed85b33968973a3e85935bb5d578ce42 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 19:39:02 +0100 Subject: [PATCH 16/30] [common] prefer using mime type over ext for smil subtitle extraction the subtitle ext for http://www.cnet.com/videos/download-amazon-prime-movies-and-tv/ is adb_xml while using the mime type it get tt(application/smptett+xml) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 85ac0400c..94a583891 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1335,7 +1335,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, From 0750b2491f5f14e51c2bf91584fd490944154393 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 19:47:20 +0100 Subject: [PATCH 17/30] [ffmpeg] try to convert tt subtitles usng dfxp2srt --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 06b8c0548..b64cd396b 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -536,7 +536,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext) - if ext == 'dfxp' or ext == 'ttml': + if ext == 'dfxp' or ext == 'ttml' or ext == 'tt': self._downloader.report_warning( 'You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') From 5f705baf5ecda6be678481ff9ab9c27a6cd54dc0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 20:42:15 +0100 Subject: [PATCH 18/30] [cnet] extract more formats --- youtube_dl/extractor/cnet.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index c154b3e19..58c26f20f 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -17,6 +17,8 @@ class CNETIE(ThePlatformIE): 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', @@ -28,8 +30,11 @@ class CNETIE(ThePlatformIE): 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', }, }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' def _real_extract(self, url): display_id = self._match_id(url) @@ -51,16 +56,12 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id) - description = vdata.get('description') or metadata.get('description') - duration = int_or_none(vdata.get('duration')) or metadata.get('duration') - - formats = [] - subtitles = {} + media_guid_path = 'media/guid/2288573011/%s' % vdata['mpxRefId'] + formats, subtitles = self._extract_theplatform_smil(self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid + release_url = self.TP_RELEASE_URL_TEMPLATE % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) @@ -68,15 +69,15 @@ class CNETIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - return { + info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info.update({ 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': metadata.get('thumbnail'), - 'duration': duration, + 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, 'formats': formats, - } + }) + return info From fe7ef95e91cec1c1794692029561a68e2aaa7809 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 23:53:32 +0100 Subject: [PATCH 19/30] [cbsinteractive] Add support for ZDNet videos --- youtube_dl/extractor/__init__.py | 2 +- .../extractor/{cnet.py => cbsinteractive.py} | 37 ++++++++++++++++--- 2 files changed, 32 insertions(+), 7 deletions(-) rename youtube_dl/extractor/{cnet.py => cbsinteractive.py} (69%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 438e1cc63..ebf5ca3e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -104,6 +104,7 @@ from .cbc import ( CBCPlayerIE, ) from .cbs import CBSIE +from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, @@ -129,7 +130,6 @@ from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE from .cnbc import CNBCIE -from .cnet import CNETIE from .cnn import ( CNNIE, CNNBlogsIE, diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cbsinteractive.py similarity index 69% rename from youtube_dl/extractor/cnet.py rename to youtube_dl/extractor/cbsinteractive.py index 58c26f20f..0011c3029 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import int_or_none -class CNETIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P[^/]+)/' +class CBSInteractiveIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video/share)/(?P[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { @@ -33,15 +35,35 @@ class CNETIE(ThePlatformIE): 'timestamp': 1433289889, 'upload_date': '20150603', }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'timestamp': 1448961720, + 'upload_date': '20151201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { + 'cnet': 2288573011, + 'zdnet': 2387448114, + } def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-cnet-video(?:-uvp)?-options='([^']+)'", + r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] @@ -56,8 +78,11 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - media_guid_path = 'media/guid/2288573011/%s' % vdata['mpxRefId'] - formats, subtitles = self._extract_theplatform_smil(self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) + media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) + formats, subtitles = [], {} + if site == 'cnet': + formats, subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue From 08136dc13805abb1832587e03e68066f07bd5776 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 10:57:57 +0100 Subject: [PATCH 20/30] [brightcove] fix format sorting --- youtube_dl/extractor/brightcove.py | 3 ++- youtube_dl/extractor/thestar.py | 4 ++++ youtube_dl/extractor/tv3.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0d162d337..a8919001d 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -396,6 +396,7 @@ class BrightcoveNewIE(InfoExtractor): 'formats': 'mincount:41', }, 'params': { + # m3u8 download 'skip_download': True, } }, { @@ -533,7 +534,7 @@ class BrightcoveNewIE(InfoExtractor): f.update({ 'url': src or streaming_src, 'format_id': build_format_id('http' if src else 'http-streaming'), - 'preference': 2 if src else 1, + 'source_preference': 0 if src else -1, }) else: f.update({ diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py index b7e9af2af..ba1380abc 100644 --- a/youtube_dl/extractor/thestar.py +++ b/youtube_dl/extractor/thestar.py @@ -19,6 +19,10 @@ class TheStarIE(InfoExtractor): 'uploader_id': '794267642001', 'timestamp': 1454353482, 'upload_date': '20160201', + }, + 'params': { + # m3u8 download + 'skip_download': True, } } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py index d3f690dc7..3867ec90d 100644 --- a/youtube_dl/extractor/tv3.py +++ b/youtube_dl/extractor/tv3.py @@ -21,6 +21,7 @@ class TV3IE(InfoExtractor): 'Failed to download MPD manifest' ], 'params': { + # m3u8 download 'skip_download': True, }, } From db8ee7ec0598f8893e4259ac9373c44726e4f84f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 18:48:05 +0600 Subject: [PATCH 21/30] [extractor/common] Fix numeric identifiers conversion in DASH URL templates --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 94a583891..011edcc0a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1515,7 +1515,8 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) media_template.replace('$$', '$') representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: From b507cc925b8dbb37b0abce748ff73a7ad102494a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 18:49:58 +0600 Subject: [PATCH 22/30] [extractor/common] Carry long line --- youtube_dl/extractor/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 011edcc0a..ec6625eea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1518,7 +1518,13 @@ class InfoExtractor(object): media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], From bbc26c8a012d215e98a98a671471cd75e7765286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 19:00:38 +0600 Subject: [PATCH 23/30] [bbc] Set vcodec to none for audio formats --- youtube_dl/extractor/bbc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index dedf721bd..425f08f2b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -328,6 +328,7 @@ class BBCCoUkIE(InfoExtractor): 'format_id': '%s_%s' % (service, format['format_id']), 'abr': abr, 'acodec': acodec, + 'vcodec': 'none', }) formats.extend(conn_formats) return formats From 2defa7d75aa424c16ca76a25a05297daed0bb5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Apr 2016 18:01:58 +0200 Subject: [PATCH 24/30] [instagram:user] Fix extraction (fixes #9059) The URL for the next page was incorrect and we always got the same page, therefore it got trapped in an infinite loop. --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 4e62098b0..11bb58d8a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -152,7 +152,7 @@ class InstagramUserIE(InfoExtractor): if not page['items']: break - max_id = page['items'][-1]['id'] + max_id = page['items'][-1]['id'].split('_')[0] media_url = ( 'http://instagram.com/%s/media?max_id=%s' % ( uploader_id, max_id)) From 41f5492fbcddfcbae133dc27e8d94ece3755df2e Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:54:40 +0100 Subject: [PATCH 25/30] [brightcove:legacy] improve format extraction and extract uploader_id, duration and timestamp --- youtube_dl/extractor/brightcove.py | 48 +++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a8919001d..a5091238b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -46,6 +46,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 'uploader_id': 1589608506001, } }, { @@ -57,6 +60,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': 1460825906, }, }, { @@ -68,6 +74,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': 1130468786001, }, }, { @@ -85,14 +94,17 @@ class BrightcoveLegacyIE(InfoExtractor): { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { - 'id': '2996102916001', + 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', - 'uploader': 'Red Bull TV', + 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': 710858724001, }, }, { @@ -106,6 +118,12 @@ class BrightcoveLegacyIE(InfoExtractor): 'playlist_mincount': 7, }, ] + FLV_VCODECS = { + 1: 'SORENSON', + 2: 'ON2', + 3: 'H264', + 4: 'VP8', + } @classmethod def _build_brighcove_url(cls, object_str): @@ -295,6 +313,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), + 'uploader_id': video_info.get('publisherId'), + 'duration': float_or_none(video_info.get('length'), 1000), + 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) @@ -318,19 +339,30 @@ class BrightcoveLegacyIE(InfoExtractor): ext = 'flv' if ext is None: ext = determine_ext(url) - size = rend.get('size') + tbr = int_or_none(rend.get('encodingRate'), 1000), a_format = { + 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, 'ext': ext, - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - 'filesize': size if size != 0 else None, + 'filesize': int_or_none(rend.get('size')) or None, + 'tbr': tbr, } + if rend.get('audioOnly'): + a_format.update({ + 'vcodec': 'none', + }) + else: + a_format.update({ + 'height': int_or_none(rend.get('frameHeight')), + 'width': int_or_none(rend.get('frameWidth')), + 'vcodec': rend.get('videoCodec'), + }) # m3u8 manifests with remote == false are media playlists # Not calling _extract_m3u8_formats here to save network traffic if ext == 'm3u8': a_format.update({ + 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), 'ext': 'mp4', 'protocol': 'm3u8', }) @@ -341,6 +373,8 @@ class BrightcoveLegacyIE(InfoExtractor): elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], + 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), + 'filesize': int_or_none(video_info.get('FLVFullSize')), }) if self._downloader.params.get('include_ads', False): From e47d19e991456fe4afdab1a76a653f7821e99c3f Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:56:01 +0100 Subject: [PATCH 26/30] [brightcove:new] extract subtitles and strip video title --- youtube_dl/extractor/brightcove.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a5091238b..6128b6762 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -515,7 +515,7 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(json_data[0]['message'], expected=True) raise - title = json_data['name'] + title = json_data['name'].strip() formats = [] for source in json_data.get('sources', []): @@ -579,20 +579,22 @@ class BrightcoveNewIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) - duration = float_or_none(json_data.get('duration'), 1000) - tags = json_data.get('tags', []) + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': json_data.get('description'), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'duration': float_or_none(json_data.get('duration'), 1000), + 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': account_id, 'formats': formats, - 'tags': tags, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), } From 3aac9b2fb1a103f1d350ba10060e59bb04a6a2e8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:56:31 +0100 Subject: [PATCH 27/30] [nowness] update tests --- youtube_dl/extractor/nowness.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 446f5901c..74860eb20 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -63,8 +63,11 @@ class NownessIE(NownessBaseIE): 'title': 'Candor: The Art of Gesticulation', 'description': 'Candor: The Art of Gesticulation', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1446745676, + 'upload_date': '20151105', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', @@ -74,8 +77,11 @@ class NownessIE(NownessBaseIE): 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1407315371, + 'upload_date': '20140806', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { # vimeo 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', @@ -90,6 +96,7 @@ class NownessIE(NownessBaseIE): 'uploader': 'Cinema Sem Lei', 'uploader_id': 'cinemasemlei', }, + 'add_ie': ['Vimeo'], }] def _real_extract(self, url): From 4d4cd35f485c652a39a631fbf3d40c9f4353e807 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 20:55:44 +0100 Subject: [PATCH 28/30] [brightcove:legacy] extract uploader_id as a string --- youtube_dl/extractor/brightcove.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 6128b6762..f9056f514 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -48,7 +48,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', 'timestamp': 1368213670, 'upload_date': '20130510', - 'uploader_id': 1589608506001, + 'uploader_id': '1589608506001', } }, { @@ -62,7 +62,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'uploader': 'Oracle', 'timestamp': 1344975024, 'upload_date': '20120814', - 'uploader_id': 1460825906, + 'uploader_id': '1460825906', }, }, { @@ -76,7 +76,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'uploader': 'Mashable', 'timestamp': 1382041798, 'upload_date': '20131017', - 'uploader_id': 1130468786001, + 'uploader_id': '1130468786001', }, }, { @@ -104,7 +104,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 'timestamp': 1409122195, 'upload_date': '20140827', - 'uploader_id': 710858724001, + 'uploader_id': '710858724001', }, }, { @@ -313,7 +313,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), - 'uploader_id': video_info.get('publisherId'), + 'uploader_id': compat_str(video_info.get('publisherId')), 'duration': float_or_none(video_info.get('length'), 1000), 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } From 23576edbfcaa3d7f0283631516c82715a85c6856 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 21:31:21 +0100 Subject: [PATCH 29/30] [brightcove:legacy] skip None value for uploader_id --- youtube_dl/extractor/brightcove.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f9056f514..c718cf385 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,13 +307,14 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + publisher_id = video_info.get('publisherId') info = { 'id': compat_str(video_info['id']), 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), - 'uploader_id': compat_str(video_info.get('publisherId')), + 'uploader_id': compat_str(publisher_id) if publisher_id else None, 'duration': float_or_none(video_info.get('length'), 1000), 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } From 6d4fc66bfc9bb3ed2a4f68366f372a9bedf6e708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 4 Apr 2016 02:26:20 +0600 Subject: [PATCH 30/30] [youtube] Add support for zwearz (Closes #9062) --- youtube_dl/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28355bf46..188066561 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -270,7 +270,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): )) |(?: youtu\.be| # just youtu.be/xxxx - vid\.plus # or vid.plus/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) @@ -758,6 +759,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'http://vid.plus/FlRa-iH7PGw', 'only_matching': True, }, + { + 'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', + 'only_matching': True, + }, { # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see