From 34724e37bbe6fcfe31dbfb372989c64295c39779 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 4 Mar 2017 21:13:30 -0500 Subject: [PATCH 01/10] [cbs.com:playlist] Extract playlists of CBS shows. --- youtube_dl/extractor/cbs.py | 99 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 58f258c54..c709df097 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,12 +1,16 @@ from __future__ import unicode_literals +from .common import InfoExtractor from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, + js_to_json, find_xpath_attr, + RegexNotFoundError, xpath_element, xpath_text, update_url_query, + urljoin, ) @@ -20,6 +24,101 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] +class CBSShowIE(InfoExtractor): + IE_DESC = 'CBS show playlists, including full episodes and clips' + IE_NAME = 'cbs.com:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)cbs.com/shows/(?P[\w-]+)' + _TEST = { + 'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert', + 'info_dict': { + 'id': 61456254, + 'title': 'The Late Show with Stephen Colbert', + }, + 'playlist_mincount': 14, + # If uncommented, the test harness tries to download all 30 playlist entries. + # Even limited to 10KB each, this can take 15 minutes. Not reasonable. 
+ # 'playlist': [{ + # 'info_dict': { + # 'id': 'xxx', + # 'ext': 'xxx.mp4', + # }, + # }], + } + + def carousel_playlist(self, url, type): + carousel = self._download_json(url, 'Downloading %s carousel' % type) + episodes = carousel['result']['data'] + carousel_title = episodes[0]['series_title'] + + entries = [] + for ep in episodes: + entries.append(self.url_result( + urljoin(url, ep['app_url']), + 'CBS', + ep['content_id'], + ep['episode_title'])) + + return self.playlist_result(entries, playlist_title=carousel_title) + + def _real_extract(self, url): + show_name = self._match_id(url) + webpage = self._download_webpage(url, show_name) + + # not-quite JSON, no double-quotes: + # var show = new CBS.Show({id:61456254}); + show_id_json = self._search_regex(r'new CBS\.Show\(([^)]*)\);', webpage, 'show_id') + + show = self._parse_json(show_id_json, show_name, transform_source=js_to_json) + + # Found in http://www.cbs.com/assets/min/js/min/com.cbs.min.js?20170303-224247 + # unminified at http://www.cbs.com/assets/js/min/com.cbs.js + # http://www.cbs.com/carousels/shows/61456254/offset/0/limit/15/xs/0/ + # => {id: 240172, title: "Full Episodes", + episodes_url = urljoin(url, '/carousels/shows/%d/offset/0/limit/15/xs/0/' % show['id']) + + # var loader = new CBS.V2.CarouselLoader({ + # 'video-preview-carousel': function(element) { + # element.videoCarousel({ + # id : 241426, + # templates : 'carousels/videoAdaptive', + # scroll : 3, + # layout : 3, + # start : 0, + # saveState : false + # }); + # } + try: + clipdata = self._parse_json( + self._search_regex(r'element\.videoCarousel\(([^)]*)\);', webpage, + 'clip carousel'), + show_name, transform_source=js_to_json) + + # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/ + # => {id: 241426, title: "Clips", + clips_url = urljoin(url, + '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' % clipdata['id']) + clips = self.carousel_playlist(clips_url, 'clips') + except RegexNotFoundError: + clips = 
{ 'entries': [] } + + # We separately retrieve a carousel of full episodes, and also one of clips. + # Clips are identifiable as such because they lack an "episode_number" field, + # unlike full episodes. + # + # It might be desirable to specify only retrieving a playlist of one or the other, + # but there isn't a good way for users to pass such parameters to InfoExtractors + # (custom URLs, maybe? With cbs: URLs?). + # + # But since the playlist is filterable, only full episodes can be returned with: + # youtube_dl --match-filter 'episode_number' http://... + # and similarly, only clips can be returned with: + # youtube_dl --match-filter '!episode_number' http://... + + playlist = self.carousel_playlist(episodes_url, 'episodes') + playlist['entries'] += clips['entries'] + playlist['id'] = show['id'] + + return playlist class CBSIE(CBSBaseIE): _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index caf1dc766..ec3c48ad9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -150,7 +150,10 @@ from .cbc import ( CBCWatchVideoIE, CBCWatchIE, ) -from .cbs import CBSIE +from .cbs import ( + CBSIE, + CBSShowIE, + ) from .cbslocal import CBSLocalIE from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( From 80df786df990bef1e801f56dcc3a8f48d51dddd0 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 4 Mar 2017 22:25:33 -0500 Subject: [PATCH 02/10] [cbs.com:playlist] Tighten VALID_URL per test_all_urls File "test/test_all_urls.py", line 98, in test_no_duplicates '%s should not match URL %r . That URL belongs to %s.' % (type(ie).__name__, url, tc['name'])) AssertionError: CBSShowIE should not match URL u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/' . That URL belongs to CBS. 
Solution is to anchor the regexp with a $, while permitting an optional terminal / --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index c709df097..154152a1a 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSShowIE(InfoExtractor): IE_DESC = 'CBS show playlists, including full episodes and clips' IE_NAME = 'cbs.com:playlist' - _VALID_URL = r'(?i)https?://(?:www\.)cbs.com/shows/(?P[\w-]+)' + _VALID_URL = r'(?i)https?://(?:www\.)cbs.com/shows/(?P[\w-]+)/?$' _TEST = { 'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert', 'info_dict': { From a019668cbdcfffd122a5c08f5e0a4c792ee478a9 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:09:17 -0400 Subject: [PATCH 03/10] [cbs.com:playlist] remove test harness commentary --- youtube_dl/extractor/cbs.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 154152a1a..21c2f190a 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -35,14 +35,6 @@ class CBSShowIE(InfoExtractor): 'title': 'The Late Show with Stephen Colbert', }, 'playlist_mincount': 14, - # If uncommented, the test harness tries to download all 30 playlist entries. - # Even limited to 10KB each, this can take 15 minutes. Not reasonable. 
- # 'playlist': [{ - # 'info_dict': { - # 'id': 'xxx', - # 'ext': 'xxx.mp4', - # }, - # }], } def carousel_playlist(self, url, type): From 8660a6b1b1ab07739a5d50bad49d01fd273dc3a3 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:11:22 -0400 Subject: [PATCH 04/10] [cbs.com:playlist] Remove enduser guidance on filtering clips --- youtube_dl/extractor/cbs.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 21c2f190a..e4bc2ad2c 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -93,19 +93,6 @@ class CBSShowIE(InfoExtractor): except RegexNotFoundError: clips = { 'entries': [] } - # We separately retrieve a carousel of full episodes, and also one of clips. - # Clips are identifiable as such because they lack an "episode_number" field, - # unlike full episodes. - # - # It might be desirable to specify only retrieving a playlist of one or the other, - # but there isn't a good way for users to pass such parameters to InfoExtractors - # (custom URLs, maybe? With cbs: URLs?). - # - # But since the playlist is filterable, only full episodes can be returned with: - # youtube_dl --match-filter 'episode_number' http://... - # and similarly, only clips can be returned with: - # youtube_dl --match-filter '!episode_number' http://... 
- playlist = self.carousel_playlist(episodes_url, 'episodes') playlist['entries'] += clips['entries'] playlist['id'] = show['id'] From 91654b0437fdedacbb9a407f21ed27b0d200c180 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:14:33 -0400 Subject: [PATCH 05/10] [cbs.com:playlist] remove js_to_json comment --- youtube_dl/extractor/cbs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index e4bc2ad2c..bbdc6df5b 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -56,11 +56,10 @@ class CBSShowIE(InfoExtractor): show_name = self._match_id(url) webpage = self._download_webpage(url, show_name) - # not-quite JSON, no double-quotes: - # var show = new CBS.Show({id:61456254}); - show_id_json = self._search_regex(r'new CBS\.Show\(([^)]*)\);', webpage, 'show_id') - - show = self._parse_json(show_id_json, show_name, transform_source=js_to_json) + show_id_js = self._search_regex(r'new CBS\.Show\(([^)]*)\);', webpage, + 'show_id') + show = self._parse_json(show_id_js, show_name, + transform_source=js_to_json) # Found in http://www.cbs.com/assets/min/js/min/com.cbs.min.js?20170303-224247 # unminified at http://www.cbs.com/assets/js/min/com.cbs.js From 9961f5379521c5c29f20f5dc8faf6bf5c67b6ee2 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:18:07 -0400 Subject: [PATCH 06/10] [cbs:playlist] flake8 for PEP 8 --- youtube_dl/extractor/cbs.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index bbdc6df5b..fac7e6bfc 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -24,6 +24,7 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + class CBSShowIE(InfoExtractor): IE_DESC = 'CBS show playlists, including full episodes and clips' IE_NAME = 
'cbs.com:playlist' @@ -80,23 +81,26 @@ class CBSShowIE(InfoExtractor): # } try: clipdata = self._parse_json( - self._search_regex(r'element\.videoCarousel\(([^)]*)\);', webpage, - 'clip carousel'), + self._search_regex(r'element\.videoCarousel\(([^)]*)\);', + webpage, 'carousel'), show_name, transform_source=js_to_json) # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/ # => {id: 241426, title: "Clips", - clips_url = urljoin(url, - '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' % clipdata['id']) + clips_url = \ + urljoin(url, + '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' + % clipdata['id']) clips = self.carousel_playlist(clips_url, 'clips') except RegexNotFoundError: - clips = { 'entries': [] } + clips = {'entries': []} playlist = self.carousel_playlist(episodes_url, 'episodes') playlist['entries'] += clips['entries'] playlist['id'] = show['id'] - return playlist + return playlist + class CBSIE(CBSBaseIE): _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' From 53b53c6638653b128d5945ee8c593c56ea496b21 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:39:25 -0400 Subject: [PATCH 07/10] [cbs:playlist] Improve tests Mark Colbert test with actual playlist count of 30, instead of a minimum. 30 is 15 clips and 15 full episodes. Add another test (only_matching) for a show with no clips, only episodes: Star Trek. playlist count is 15. 
--- youtube_dl/extractor/cbs.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index fac7e6bfc..3e6573ff5 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -29,14 +29,25 @@ class CBSShowIE(InfoExtractor): IE_DESC = 'CBS show playlists, including full episodes and clips' IE_NAME = 'cbs.com:playlist' _VALID_URL = r'(?i)https?://(?:www\.)cbs.com/shows/(?P[\w-]+)/?$' - _TEST = { - 'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert', - 'info_dict': { - 'id': 61456254, - 'title': 'The Late Show with Stephen Colbert', + _TESTS = [ + { + 'url': 'http://www.cbs.com/shows/the-late-show-with-stephen-colbert', + 'info_dict': { + 'id': 61456254, + 'title': 'The Late Show with Stephen Colbert', + }, + 'playlist_count': 30, }, - 'playlist_mincount': 14, - } + { + 'url': 'http://www.cbs.com/shows/star_trek/', + 'info_dict': { + 'id': 22927, + 'title': 'Star Trek - The Original Series', + }, + 'playlist_count': 15, # No clips, only episodes + 'only_matching': True, + }, + ] def carousel_playlist(self, url, type): carousel = self._download_json(url, 'Downloading %s carousel' % type) From 7fc3a7d09f288e2565f1a1afe84e67093a71ee3e Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:54:22 -0400 Subject: [PATCH 08/10] [cbs:playlist] if instead of try per @dstftw --- youtube_dl/extractor/cbs.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 3e6573ff5..e0b94f731 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -6,7 +6,6 @@ from ..utils import ( int_or_none, js_to_json, find_xpath_attr, - RegexNotFoundError, xpath_element, xpath_text, update_url_query, @@ -44,7 +43,7 @@ class CBSShowIE(InfoExtractor): 'id': 22927, 'title': 'Star Trek - The Original Series', }, - 'playlist_count': 15, # No clips, 
only episodes + 'playlist_count': 15, # No clips, only episodes 'only_matching': True, }, ] @@ -90,12 +89,11 @@ class CBSShowIE(InfoExtractor): # saveState : false # }); # } - try: - clipdata = self._parse_json( - self._search_regex(r'element\.videoCarousel\(([^)]*)\);', - webpage, 'carousel'), - show_name, transform_source=js_to_json) - + clipdata = self._parse_json( + self._search_regex(r'element\.videoCarousel\(([^)]*)\);', + webpage, 'carousel', default='{}'), + show_name, transform_source=js_to_json) + if (clipdata.get('id')): # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/ # => {id: 241426, title: "Clips", clips_url = \ @@ -103,7 +101,7 @@ class CBSShowIE(InfoExtractor): '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' % clipdata['id']) clips = self.carousel_playlist(clips_url, 'clips') - except RegexNotFoundError: + else: clips = {'entries': []} playlist = self.carousel_playlist(episodes_url, 'episodes') From ca4d55a78cf93283a654f753236dde9af7a2cb2f Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 12:56:04 -0400 Subject: [PATCH 09/10] [cbs.com:playlist] avoid continuation line It was originally there to avoid exceeding 80cols, but this is a better way. 
--- youtube_dl/extractor/cbs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index e0b94f731..1d491d43e 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -96,10 +96,10 @@ class CBSShowIE(InfoExtractor): if (clipdata.get('id')): # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/ # => {id: 241426, title: "Clips", - clips_url = \ - urljoin(url, - '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' - % clipdata['id']) + clips_url = urljoin( + url, + '/carousels/videosBySection/%d/offset/0/limit/15/xs/0' + % clipdata['id']) clips = self.carousel_playlist(clips_url, 'clips') else: clips = {'entries': []} From cfeaa28a1e40069b9bf76e69df752a0570df35d4 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 31 Mar 2017 01:46:55 -0400 Subject: [PATCH 10/10] [cbs:playlist] say "clip carousel" partial revert of 9961f5379521c5c29f20f5dc8faf6bf5c67b6ee2 where we lost one word while filling a line for flake8. Oops. --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1d491d43e..21ce44bbe 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -91,7 +91,7 @@ class CBSShowIE(InfoExtractor): # } clipdata = self._parse_json( self._search_regex(r'element\.videoCarousel\(([^)]*)\);', - webpage, 'carousel', default='{}'), + webpage, 'clip carousel', default='{}'), show_name, transform_source=js_to_json) if (clipdata.get('id')): # http://www.cbs.com/carousels/videosBySection/241426/offset/0/limit/15/xs/0/