From e54d50902b9b78ea22653b92883da1e182276102 Mon Sep 17 00:00:00 2001 From: nimeir Date: Tue, 26 Mar 2019 21:54:38 +0000 Subject: [PATCH 1/9] fixed json parsing --- youtube_dl/extractor/rtp.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 533ee27cb..00e68331b 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor - +from ..utils import js_to_json class RTPIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' @@ -35,18 +35,17 @@ class RTPIE(InfoExtractor): 'twitter:title', webpage, display_name='title', fatal=True) description = self._html_search_meta('description', webpage) thumbnail = self._og_search_thumbnail(webpage) - player_config = self._search_regex( - r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config') + r'(?s)RTPPlayer\(({.*?})', webpage, 'player config') + player_config = js_to_json(player_config) config = self._parse_json(player_config, video_id) - path, ext = config.get('file').rsplit('.', 1) formats = [{ 'format_id': 'rtmp', 'ext': ext, - 'vcodec': config.get('type') == 'audio' and 'none' or None, + #'vcodec': config.get('type') = = 'audio' and 'none' or None, 'preference': -2, - 'url': 'rtmp://{streamer:s}/{application:s}'.format(**config), + 'url': '{file}'.format(**config), 'app': config.get('application'), 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), 'page_url': url, @@ -70,6 +69,7 @@ class RTPIE(InfoExtractor): 'vcodec': 'h264', }, } + ''' r = replacements[config['type']] if re.match(r['pattern'], config['file']) is not None: formats.append({ @@ -77,6 +77,7 @@ class RTPIE(InfoExtractor): 'url': re.sub(r['pattern'], r['repl'], config['file']), 'vcodec': r['vcodec'], }) + ''' self._sort_formats(formats) From 65874699da828cf4ec95a4d5d37f7a847a615901 Mon 
Sep 17 00:00:00 2001 From: nimeir Date: Tue, 26 Mar 2019 22:31:39 +0000 Subject: [PATCH 2/9] Fixed regex search and removed unnecessary format descriptions --- youtube_dl/extractor/rtp.py | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 00e68331b..9ef84ac00 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -1,11 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import js_to_json + class RTPIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' _TESTS = [{ @@ -43,42 +42,14 @@ class RTPIE(InfoExtractor): formats = [{ 'format_id': 'rtmp', 'ext': ext, - #'vcodec': config.get('type') = = 'audio' and 'none' or None, 'preference': -2, 'url': '{file}'.format(**config), 'app': config.get('application'), 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), 'page_url': url, - 'rtmp_live': config.get('live', False), 'player_url': 'http://programas.rtp.pt/play/player.swf?v3', 'rtmp_real_time': True, }] - - # Construct regular HTTP download URLs - replacements = { - 'audio': { - 'format_id': 'mp3', - 'pattern': r'^nas2\.share/wavrss/', - 'repl': 'http://rsspod.rtp.pt/podcasts/', - 'vcodec': 'none', - }, - 'video': { - 'format_id': 'mp4_h264', - 'pattern': r'^nas2\.share/h264/', - 'repl': 'http://rsspod.rtp.pt/videocasts/', - 'vcodec': 'h264', - }, - } - ''' - r = replacements[config['type']] - if re.match(r['pattern'], config['file']) is not None: - formats.append({ - 'format_id': r['format_id'], - 'url': re.sub(r['pattern'], r['repl'], config['file']), - 'vcodec': r['vcodec'], - }) - ''' - self._sort_formats(formats) return { From 0d7184728060db8c5f316b1151779c3efc874b32 Mon Sep 17 00:00:00 2001 From: nimeir Date: Sat, 20 Apr 2019 20:20:44 +0100 Subject: [PATCH 3/9] playlist class --- youtube_dl/extractor/rtp.py | 4 ++++ 1 
file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 9ef84ac00..02d48d6fd 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -59,3 +59,7 @@ class RTPIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, } + +class RTPPlaylistIE(RTPIE): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' + From 0d982b71af3239e64b9bea344d74ee7ab525b27f Mon Sep 17 00:00:00 2001 From: nimeir Date: Sun, 21 Apr 2019 00:22:20 +0100 Subject: [PATCH 4/9] experimenting --- youtube_dl/extractor/rtp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 02d48d6fd..ae86a842c 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -28,7 +28,6 @@ class RTPIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) title = self._html_search_meta( 'twitter:title', webpage, display_name='title', fatal=True) @@ -61,5 +60,8 @@ class RTPIE(InfoExtractor): } class RTPPlaylistIE(RTPIE): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' 
- + _VALID_URL = r'https://www.rtp.pt/play/p510/e401638/aleixo-fm' + def _real_extract(self, url): + return { + 'entries': [{'id':'hello1', 'url':'https://cdn-ondemand.rtp.pt/nas2.share/wavrss/at3/1904/5786802_292886-1904170925.mp3'}] + } From e9ef34058e85894dc7cb77ad81f25939de2a6f66 Mon Sep 17 00:00:00 2001 From: nimeir Date: Sun, 21 Apr 2019 15:06:36 +0100 Subject: [PATCH 5/9] figured out how to make extractors work --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/rtp.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8e7a5bf41..24aa145e6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -968,7 +968,10 @@ from .rtl2 import ( RTL2YouIE, RTL2YouSeriesIE, ) -from .rtp import RTPIE +from .rtp import ( + RTPIE, + RTPPlaylistIE +) from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index ae86a842c..522f0610d 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -60,8 +60,11 @@ class RTPIE(InfoExtractor): } class RTPPlaylistIE(RTPIE): - _VALID_URL = r'https://www.rtp.pt/play/p510/e401638/aleixo-fm' + #_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' 
+ _VALID_URL = r'1234' def _real_extract(self, url): + return { - 'entries': [{'id':'hello1', 'url':'https://cdn-ondemand.rtp.pt/nas2.share/wavrss/at3/1904/5786802_292886-1904170925.mp3'}] + '_type': 'playlist', + 'entries':[{'_type':'url', 'url':'https://www.rtp.pt/play/p510/e400299/aleixo-fm'}] } From bd2c15bcb557edc2a7d6727d07a9186a9cc47dc4 Mon Sep 17 00:00:00 2001 From: nimeir Date: Sun, 21 Apr 2019 22:26:38 +0100 Subject: [PATCH 6/9] added playlist functionality --- youtube_dl/extractor/rtp.py | 39 +++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 522f0610d..01386a55a 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -1,12 +1,16 @@ # coding: utf-8 + from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import js_to_json class RTPIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' + _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -59,12 +63,31 @@ class RTPIE(InfoExtractor): 'thumbnail': thumbnail, } -class RTPPlaylistIE(RTPIE): - #_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)/(?P[^/?#]+)/?' 
- _VALID_URL = r'1234' - def _real_extract(self, url): - return { - '_type': 'playlist', - 'entries':[{'_type':'url', 'url':'https://www.rtp.pt/play/p510/e400299/aleixo-fm'}] - } +class RTPPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)' + + def _get_program_id(self, url): + mobj = re.match(self._VALID_URL, url) + program_id = mobj.group('program_id') + return program_id + + def _extract_entries(self, url, program_id, page): + entry_url = "https://www.rtp.pt/play/bg_l_ep/?&listProgram=%s&listcategory=&listchannel=&type=radio&page=%s" % ( + program_id, page + ) + webpage = self._download_webpage(entry_url, program_id) + return [self.url_result('https://www.rtp.pt/play/p%s/%s/' % (program_id, episode), 'RTP') + for episode in re.findall(r'e\d+', webpage)] + + def _real_extract(self, url): + page = 1 + program_id = self._get_program_id(url) + + entry = self._extract_entries(url, program_id, page) + new_entry = self._extract_entries(url, program_id, page + 1) + while new_entry != []: + new_entry = self._extract_entries(url, program_id, page + 1) + entry += new_entry + page += 1 + return self.playlist_result(entry, playlist_id=program_id) From 63ca9b183b30c0e6ed9a0437011989af1cd9ae3d Mon Sep 17 00:00:00 2001 From: nimeir Date: Mon, 22 Apr 2019 01:11:09 +0100 Subject: [PATCH 7/9] fix travisci error --- youtube_dl/extractor/rtp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 01386a55a..921efca13 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -65,7 +65,7 @@ class RTPIE(InfoExtractor): class RTPPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)]+)(?!.)' def _get_program_id(self, url): mobj = re.match(self._VALID_URL, url) From a847d06337aa202080ada49f8935c6457c8ac72f Mon Sep 17 00:00:00 2001 From: nimeir Date: Mon, 22 Apr 
2019 01:53:20 +0100 Subject: [PATCH 8/9] fixed typo --- youtube_dl/extractor/rtp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 921efca13..2712b2251 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -65,7 +65,7 @@ class RTPIE(InfoExtractor): class RTPPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)]+)(?!.)' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)(?!.+)' def _get_program_id(self, url): mobj = re.match(self._VALID_URL, url) From e045ba67c3266dc444e7158c708dd74c3710898c Mon Sep 17 00:00:00 2001 From: nimeir Date: Sun, 5 May 2019 14:07:15 +0100 Subject: [PATCH 9/9] added all fixes more fixes m3u8 format processing --- youtube_dl/extractor/rtp.py | 55 +++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index 2712b2251..3326a1e0b 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -3,9 +3,14 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + determine_ext, + js_to_json, + update_url_query +) class RTPIE(InfoExtractor): @@ -41,18 +46,24 @@ class RTPIE(InfoExtractor): r'(?s)RTPPlayer\(({.*?})', webpage, 'player config') player_config = js_to_json(player_config) config = self._parse_json(player_config, video_id) - path, ext = config.get('file').rsplit('.', 1) + path = config.get('file') + ext = determine_ext(path) + formats = [{ - 'format_id': 'rtmp', 'ext': ext, 'preference': -2, - 'url': '{file}'.format(**config), + 'url': path, 'app': config.get('application'), - 'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path), 'page_url': url, 'player_url': 'http://programas.rtp.pt/play/player.swf?v3', - 'rtmp_real_time': True, }] + + if ext == 'm3u8': + 
m3u8_formats = self._extract_m3u8_formats(path, video_id) + for i in m3u8_formats: + i.update(formats[0]) + i['ext'] = 'mp4' + formats = m3u8_formats self._sort_formats(formats) return { @@ -65,29 +76,27 @@ class RTPIE(InfoExtractor): class RTPPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)(?!.+)' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P[0-9]+)' def _get_program_id(self, url): mobj = re.match(self._VALID_URL, url) program_id = mobj.group('program_id') return program_id - def _extract_entries(self, url, program_id, page): - entry_url = "https://www.rtp.pt/play/bg_l_ep/?&listProgram=%s&listcategory=&listchannel=&type=radio&page=%s" % ( - program_id, page - ) - webpage = self._download_webpage(entry_url, program_id) - return [self.url_result('https://www.rtp.pt/play/p%s/%s/' % (program_id, episode), 'RTP') - for episode in re.findall(r'e\d+', webpage)] + def _extract_entries(self, url, program_id): + for page in itertools.count(1): + query = update_url_query("https://www.rtp.pt/play/bg_l_ep/", { + 'listProgram': program_id, + 'type': 'radio', + 'page': page + }) + webpage = self._download_webpage(query, program_id, 'Downloading page %d' % page) + if not webpage: + break + for episode in re.findall(r'p%s/e(\d+)' % program_id, webpage): + yield self.url_result('https://www.rtp.pt/play/p%s/e%s/' % (program_id, episode), 'RTP') def _real_extract(self, url): - page = 1 program_id = self._get_program_id(url) - - entry = self._extract_entries(url, program_id, page) - new_entry = self._extract_entries(url, program_id, page + 1) - while new_entry != []: - new_entry = self._extract_entries(url, program_id, page + 1) - entry += new_entry - page += 1 - return self.playlist_result(entry, playlist_id=program_id) + entry = self._extract_entries(url, program_id) + return self.playlist_result(entry, playlist_id=program_id) \ No newline at end of file