From cc80dbec9eb8c0aa53d45c7cfa36d03de4ee8b57 Mon Sep 17 00:00:00 2001 From: gfabiano Date: Wed, 27 Sep 2017 23:02:13 +0200 Subject: [PATCH 1/4] [pbs] Fix extraction (closes #14305) --- youtube_dl/extractor/pbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8889e4a1a..fb3c32b93 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -416,7 +416,7 @@ class PBSIE(InfoExtractor): r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r'', # jwplayer + r'', # jwplayer ] media_id = self._search_regex( @@ -493,7 +493,7 @@ class PBSIE(InfoExtractor): chapters = [] # Player pages may also serve different qualities - for page in ('widget/partnerplayer', 'portalplayer'): + for page in ('widget/partnerplayer', 'portalplayer', 'partnerplayer'): player = self._download_webpage( 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) From dcbe7b25f5175464050e6d5045cf5ac27a6ead18 Mon Sep 17 00:00:00 2001 From: gfabiano Date: Sun, 1 Oct 2017 15:05:54 +0200 Subject: [PATCH 2/4] Fix video_id extraction and improves regexes --- youtube_dl/extractor/pbs.py | 59 +++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index fb3c32b93..0cb166797 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -14,6 +14,7 @@ from ..utils import ( strip_jsonp, strip_or_none, unified_strdate, + urljoin, US_RATINGS, ) @@ -215,7 +216,7 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2365297690', 'ext': 'mp4', - 'title': 'FRONTLINE - Losing Iraq', + 'title': 'FRONTLINE - S32 Ep15: Losing Iraq', 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, @@ -250,8 +251,8 @@ class PBSIE(InfoExtractor): 'id': '2365160389', 'display_id': 'killer-typhoon', 'ext': 'mp4', - 'description': 'md5:c741d14e979fc53228c575894094f157', - 'title': 'NOVA - Killer Typhoon', + 'description': 'md5:d3a61bce8de59bff91cd75269e38c13f', + 'title': 'NOVA - S41 Ep6: Killer Typhoon', 'duration': 3172, 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140122', @@ -310,7 +311,7 @@ class PBSIE(InfoExtractor): 'id': '2365546844', 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business', 'ext': 'mp4', - 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", + 'title': "A Chef's Life - S3 Ep5: Prickly Business", 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5', 'duration': 1480, 'thumbnail': r're:^https?://.*\.jpg$', @@ -323,7 +324,7 @@ class PBSIE(InfoExtractor): 'id': '2070868960', 'display_id': 'the-atomic-artists', 'ext': 'mp4', - 'title': 'FRONTLINE - The Atomic Artists', + 'title': 'FRONTLINE - S29 Ep17: The Atomic Artists', 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800', 'duration': 723, 'thumbnail': r're:^https?://.*\.jpg$', @@ -339,7 +340,7 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2365641075', 'ext': 'mp4', - 'title': 'FRONTLINE - Netanyahu at War', + 'title': 'FRONTLINE - S34 Ep4: Netanyahu at War', 'duration': 6852, 'thumbnail': r're:^https?://.*\.jpg$', 'formats': 'mincount:8', @@ -360,6 +361,21 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/nova/space/death-dive-to-Saturn.html', + 'info_dict': { + 'id': '3004606354', + 'ext': 'mp4', + 'title': 'NOVA - S44 Ep12: Death Dive to Saturn', + 'description': 'md5:683464c4d38d73c55d53ea45dff5f2e3', + 'duration': 3272, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -415,8 +431,10 @@ class PBSIE(InfoExtractor): MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer - r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r'', # jwplayer + r'<(?:section|div)[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ + r']+id\s*=\s*"pbs_video_id_[0-9]+"\s*value="(\d+)"[^>]*>', # jwplayer + r']+id\s*=\s*"video_(\d+)"[^>]*>', + r'<(?:button|input)[^>]+data-video-id\s*=\s*["\']?(\d+)[^>]*>', ] media_id = self._search_regex( @@ -424,6 +442,29 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date, description + media_id = self._search_regex( + [r'<(?:section|div)[^>]+data-coveid="([^"]+)"', + r']+id\s*=\s*"pbs_video_id_[0-9]+"\s*value="([^"]+)"[^>]*>', + ], + webpage, 'media ID', fatal=False, default=None) + if media_id: + player = self._download_webpage( + urljoin('http://player.pbs.org/partnerplayer/', media_id), + display_id, 'Downloading partnerplayer page', fatal=False) + jwSettings = self._parse_json( + self._search_regex( + r'(?s)jwSettings\s*=\s*({.+?});', + player, 'jwSetting data', default='{}'), + presumptive_id, transform_source=js_to_json, fatal=False) + if jwSettings and jwSettings.get('videoId'): + return jwSettings['videoId'], presumptive_id, upload_date, description + + media_id = self._search_regex( + MEDIA_ID_REGEXES, player, 'media ID', fatal=False, default=None) + + if media_id: + return media_id, presumptive_id, upload_date, description + # Fronline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) @@ -493,7 +534,7 @@ class PBSIE(InfoExtractor): chapters = [] # Player pages may also serve different qualities - for page in ('widget/partnerplayer', 'portalplayer', 'partnerplayer'): + for page in ('widget/partnerplayer', 'portalplayer', 'viralplayer'): player = self._download_webpage( 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) From d193805c8c16c79b283ecb0ff6d9b0ae6719deef Mon Sep 17 00:00:00 2001 From: gfabiano Date: Fri, 6 Oct 2017 22:04:45 +0200 Subject: [PATCH 3/4] Fix --- youtube_dl/extractor/pbs.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 0cb166797..8a10cf965 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -438,7 +438,7 @@ class PBSIE(InfoExtractor): ] media_id = self._search_regex( - MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) + MEDIA_ID_REGEXES, webpage, 'media ID', default=None) if media_id: return media_id, presumptive_id, upload_date, description @@ -446,24 +446,19 @@ class PBSIE(InfoExtractor): [r'<(?:section|div)[^>]+data-coveid="([^"]+)"', r']+id\s*=\s*"pbs_video_id_[0-9]+"\s*value="([^"]+)"[^>]*>', ], - webpage, 'media ID', fatal=False, default=None) + webpage, 'media ID', default=None) if media_id: - player = self._download_webpage( - urljoin('http://player.pbs.org/partnerplayer/', media_id), - display_id, 'Downloading partnerplayer page', fatal=False) - jwSettings = self._parse_json( - self._search_regex( - r'(?s)jwSettings\s*=\s*({.+?});', - player, 'jwSetting data', default='{}'), - presumptive_id, transform_source=js_to_json, fatal=False) - if jwSettings and jwSettings.get('videoId'): - return jwSettings['videoId'], presumptive_id, upload_date, description + return self._extract_webpage( + 'http://player.pbs.org/partnerplayer/%s/' % media_id + )[0], presumptive_id, upload_date, description - media_id = self._search_regex( - MEDIA_ID_REGEXES, player, 'media ID', fatal=False, default=None) - - if media_id: - return media_id, presumptive_id, upload_date, description + jwSettings = self._parse_json( + self._search_regex( + r'(?s)jwSettings\s*=\s*({.+?});', + webpage or '', 'jwSetting data', default='{}'), + presumptive_id, transform_source=js_to_json, fatal=False) + if jwSettings and jwSettings.get('videoId'): + return jwSettings['videoId'], presumptive_id, upload_date, description # Fronline video embedded via flp video_id = self._search_regex( @@ -534,7 +529,7 @@ class PBSIE(InfoExtractor): chapters = [] # Player pages may also serve different qualities - for page in ('widget/partnerplayer', 'portalplayer', 'viralplayer'): + for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( 'http://player.pbs.org/%s/%s' % (page, video_id), display_id, 'Downloading %s page' % page, fatal=False) From 9165606929ad5e8b362bdacfd845359a02594314 Mon Sep 17 00:00:00 2001 From: gfabiano Date: Mon, 30 Jul 2018 18:30:38 +0200 Subject: [PATCH 4/4] resolve conflicts --- youtube_dl/extractor/pbs.py | 53 ++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8a10cf965..3a0f975fe 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -14,6 +14,7 @@ from ..utils import ( strip_jsonp, strip_or_none, unified_strdate, + url_or_none, urljoin, US_RATINGS, ) @@ -361,6 +362,49 @@ class PBSIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/', + 'info_dict': { + 'id': '2365936247', + 'ext': 'mp4', + 'title': 'Antiques Roadshow - Indianapolis, Hour 2', + 'description': 'md5:524b32249db55663e7231b6b8d1671a2', + 'duration': 3180, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/', + 'info_dict': { + 'id': '3007193718', + 'ext': 'mp4', + 'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster", + 'description': 'md5:37efbac85e0c09b009586523ec143652', + 'duration': 6292, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, + { + 'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/', + 'info_dict': { + 'id': '3011407934', + 'ext': 'mp4', + 'title': 'Stories from the Stage - Road Trip', + 'duration': 1619, + 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 403: Forbidden'], { 'url': 'http://www.pbs.org/wgbh/nova/space/death-dive-to-Saturn.html', 'info_dict': { @@ -431,10 +475,11 @@ class PBSIE(InfoExtractor): MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer - r'<(?:section|div)[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r']+id\s*=\s*"pbs_video_id_[0-9]+"\s*value="(\d+)"[^>]*>', # jwplayer - r']+id\s*=\s*"video_(\d+)"[^>]*>', - r'<(?:button|input)[^>]+data-video-id\s*=\s*["\']?(\d+)[^>]*>', + r']+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ + r'', # jwplayer + r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", + r']+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ + r']+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/ ] media_id = self._search_regex(