From 57757308e1167ed6bb422a447180b24fe3ee2ad4 Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Thu, 29 Mar 2018 06:34:03 +0700 Subject: [PATCH 01/10] [DuaPuluhDetik] Add new extractor for 20.detik.com --- youtube_dl/extractor/detik.py | 67 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/detik.py diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py new file mode 100644 index 000000000..3283f9e60 --- /dev/null +++ b/youtube_dl/extractor/detik.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class DuaPuluhDetikIE(InfoExtractor): + _VALID_URL = r'https?://20\.detik\.com/[^/]+/(?P\d+)-(?P\d+)/(?P[^/?#&]+)' + IE_NAME = '20detik' + _TESTS = [{ + 'url': 'https://20.detik.com/detikflash/20180328-180328002/dramatis-polisi-selamatkan-pria-yang-coba-bunuh-diri', + 'info_dict': { + 'id': '180328002', + 'display_id': '20180328-180328002', + 'slug': 'dramatis-polisi-selamatkan-pria-yang-coba-bunuh-diri', + 'upload_date': '20180328', + 'title': 'md5:92c18d820d8937f259007e9c6ce40e6b', + 'description': 'md5:3953164fc1746eb98aa3729140f9b5b8', + 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', + 'ext': 'mp4' + } + }, { + 'url': 'https://20.detik.com/e-flash/20180328-180328009/unboxing-huawei-p20-pro-', + 'only_matching': True + }, { + 'url': 'https://20.detik.com/otobuzz/20180228-180228081/primadona-baru-di-kelas-low-mpv', + 'only_matching': True + }, { + 'url': 'https://20.detik.com/sport-buzz/20180328-180328013/messi-kabur-melihat-argentina-dibantai-spanyol', + 'only_matching': True + }, { + 'url': 'https://20.detik.com/piala-dunia-2018/20180328-180328005/gary-lineker-dan-memori-piala-dunia-1986', + 'only_matching': True + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + upload_date, video_id, slug = mobj.group('date', 'id', 'slug') + embed_url = 'https://20.detik.com/embed/%s' % video_id + display_id = "%s-%s" % (upload_date, video_id) + webpage = self._download_webpage(embed_url, video_id) + m3u8_url = self._html_search_regex( + r'''["\']videoUrl["\']\s*:\s*["\'](?P.*?)["\']''', + webpage, 'm3u8_url', group='m3u8_url', default='', fatal=False) + if len(m3u8_url) == 0: + raise ExtractorError('Video not found') + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_property('image', webpage) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'slug': slug, + 'upload_date': upload_date, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'ext': 'mp4', + 'formats': formats + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de48a37ad..2aa369663 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -258,6 +258,7 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE +from .detik import DuaPuluhDetikIE from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE From 1141133e714faeae5336f2ee5a586115887c665b Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 04:12:14 +0700 Subject: [PATCH 02/10] [20detik] Use fatal search regex --- youtube_dl/extractor/detik.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index 3283f9e60..12dc392e1 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -44,7 +44,7 @@ class DuaPuluhDetikIE(InfoExtractor): webpage = self._download_webpage(embed_url, video_id) m3u8_url = self._html_search_regex( r'''["\']videoUrl["\']\s*:\s*["\'](?P.*?)["\']''', - webpage, 'm3u8_url', group='m3u8_url', default='', fatal=False) + webpage, 'm3u8_url', group='m3u8_url', default='') if len(m3u8_url) == 0: raise ExtractorError('Video not found') title = self._og_search_title(webpage) From 5323d532b747439c8e0cb181e6b1a9b0a90167fe Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 04:15:22 +0700 Subject: [PATCH 03/10] [20detik] Remove unused meta field --- youtube_dl/extractor/detik.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index 12dc392e1..b86536b70 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -38,7 +38,7 @@ class DuaPuluhDetikIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - upload_date, video_id, slug = mobj.group('date', 'id', 'slug') + upload_date, video_id = mobj.group('date', 'id') embed_url = 'https://20.detik.com/embed/%s' % video_id display_id = "%s-%s" % (upload_date, video_id) webpage = self._download_webpage(embed_url, video_id) @@ -57,7 +57,6 @@ class DuaPuluhDetikIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'slug': slug, 'upload_date': upload_date, 'title': title, 'description': description, From 67a9e1b2d5fa4403e2865077bb69a70931f34ef9 Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 04:16:55 +0700 Subject: [PATCH 04/10] [20detik] Remove 'ext' meta property --- youtube_dl/extractor/detik.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index b86536b70..4e0ecf7d1 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -61,6 +61,5 @@ class DuaPuluhDetikIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, - 'ext': 'mp4', 'formats': formats } From 0f70fb8ae09ed8ca10fae0cb91612eb4c86c630d Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 04:18:13 +0700 Subject: [PATCH 05/10] [20detik] Fix search regex --- youtube_dl/extractor/detik.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index 4e0ecf7d1..7a6af9a57 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -43,7 +43,7 @@ class DuaPuluhDetikIE(InfoExtractor): display_id = "%s-%s" % (upload_date, video_id) webpage = self._download_webpage(embed_url, video_id) m3u8_url = self._html_search_regex( - r'''["\']videoUrl["\']\s*:\s*["\'](?P.*?)["\']''', + r'''["\']videoUrl["\']\s*:\s*["\'](?P.+)["\']''', webpage, 'm3u8_url', group='m3u8_url', default='') if len(m3u8_url) == 0: raise ExtractorError('Video not found') From 662d14ea7ef4e6421b00a0a60259d005b98c8bfb Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 04:19:54 +0700 Subject: [PATCH 06/10] [20detik] Remove unnecessary named group --- youtube_dl/extractor/detik.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index 7a6af9a57..6347017e5 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -44,7 +44,7 @@ class DuaPuluhDetikIE(InfoExtractor): webpage = self._download_webpage(embed_url, video_id) m3u8_url = self._html_search_regex( r'''["\']videoUrl["\']\s*:\s*["\'](?P.+)["\']''', - webpage, 'm3u8_url', group='m3u8_url', default='') + webpage, 'm3u8_url', default='') if len(m3u8_url) == 0: raise ExtractorError('Video not found') title = self._og_search_title(webpage) From 14f4c41cef149a6941751b85caf5b6f39046192e Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Sat, 31 Mar 2018 09:04:14 +0700 Subject: [PATCH 07/10] [Generic] Move 20detik embed extractor into generic extractor --- youtube_dl/extractor/detik.py | 65 ------------------------------ youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 21 ++++++++++ 3 files changed, 21 insertions(+), 66 deletions(-) delete mode 100644 youtube_dl/extractor/detik.py diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py deleted file mode 100644 index 6347017e5..000000000 --- a/youtube_dl/extractor/detik.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class DuaPuluhDetikIE(InfoExtractor): - _VALID_URL = r'https?://20\.detik\.com/[^/]+/(?P\d+)-(?P\d+)/(?P[^/?#&]+)' - IE_NAME = '20detik' - _TESTS = [{ - 'url': 'https://20.detik.com/detikflash/20180328-180328002/dramatis-polisi-selamatkan-pria-yang-coba-bunuh-diri', - 'info_dict': { - 'id': '180328002', - 'display_id': '20180328-180328002', - 'slug': 'dramatis-polisi-selamatkan-pria-yang-coba-bunuh-diri', - 'upload_date': '20180328', - 'title': 'md5:92c18d820d8937f259007e9c6ce40e6b', - 'description': 'md5:3953164fc1746eb98aa3729140f9b5b8', - 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', - 'ext': 'mp4' - } - }, { - 'url': 'https://20.detik.com/e-flash/20180328-180328009/unboxing-huawei-p20-pro-', - 'only_matching': True - }, { - 'url': 'https://20.detik.com/otobuzz/20180228-180228081/primadona-baru-di-kelas-low-mpv', - 'only_matching': True - }, { - 'url': 'https://20.detik.com/sport-buzz/20180328-180328013/messi-kabur-melihat-argentina-dibantai-spanyol', - 'only_matching': True - }, { - 'url': 'https://20.detik.com/piala-dunia-2018/20180328-180328005/gary-lineker-dan-memori-piala-dunia-1986', - 'only_matching': True - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - upload_date, video_id = mobj.group('date', 'id') - embed_url = 'https://20.detik.com/embed/%s' % video_id - display_id = "%s-%s" % (upload_date, video_id) - webpage = self._download_webpage(embed_url, video_id) - m3u8_url = self._html_search_regex( - r'''["\']videoUrl["\']\s*:\s*["\'](?P.+)["\']''', - webpage, 'm3u8_url', default='') - if len(m3u8_url) == 0: - raise ExtractorError('Video not found') - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_property('image', webpage) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native') - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'upload_date': upload_date, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'formats': formats - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2aa369663..de48a37ad 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -258,7 +258,6 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE -from .detik import DuaPuluhDetikIE from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cf64398e3..db13d9968 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1967,6 +1967,15 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, + { + # 20detik embed + 'url': 'https://20.detik.com/detikflash/20180328-180328002/dramatis-polisi-selamatkan-pria-yang-coba-bunuh-diri', + 'info_dict': { + 'id': '180328002', + 'title': 'md5:92c18d820d8937f259007e9c6ce40e6b', + 'ext': 'mp4' + } } # { # # TODO: find another test @@ -2829,6 +2838,13 @@ class GenericIE(InfoExtractor): }) return info + # Look for 20detik (https://20.detik.com) embeds + mobj = re.search( + r']+?src=(["\'])(?Phttps?://20\.detik\.com/embed/(\d+)[^"\']+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: @@ -3139,6 +3155,11 @@ class GenericIE(InfoExtractor): if embed_url and embed_url != url: return self.url_result(embed_url) + if not found: + # DetikFlow: It's basically a 'modified' FlowPlayer used in https://20.detik.com + found = re.findall( + r'["\']videoUrl["\']\s*:\s*["\']([^"\']+)["\']', webpage) + if not found: raise UnsupportedError(url) From b2805ca14109e2e4eed3ed99a70bde43f3fa9671 Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Tue, 3 Apr 2018 10:08:11 +0700 Subject: [PATCH 08/10] [20detik] Extract 20detik embeds --- youtube_dl/extractor/detik.py | 59 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 5 --- 3 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 youtube_dl/extractor/detik.py diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py new file mode 100644 index 000000000..e97745acd --- /dev/null +++ b/youtube_dl/extractor/detik.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class DuaPuluhDetikIE(InfoExtractor): + _VALID_URL = r'https?://20\.detik\.com/embed/(?P\d+)' + IE_NAME = '20detik' + _TESTS = [{ + 'url': 'https://20.detik.com/embed/180403001?autostart=1', + 'info_dict': { + 'id': '180403001', + 'title': 'Dahsyatnya Rudal Anti-balistik yang Diuji Coba Rusia', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', + 'ext': 'mp4' + } + }, { + 'url': 'https://20.detik.com/embed/180326044', + 'info_dict': { + 'id': '180326044', + 'title': 'md5:204cbc0b3b51b701ee9dc6a502f1e17b', + 'description': 'md5:227d860110eda61876b243e23fe38538', + 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', + 'ext': 'mp4' + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + m3u8_url = self._html_search_regex( + r'["\']videoUrl["\']\s*:\s*["\'](?P.+)["\']', + webpage, 'm3u8_url') + + if m3u8_url is None: + raise ExtractorError('Video not found') + + title = self._og_search_title(webpage) + description = self._og_search_description( + webpage, default='') + thumbnail = self._og_search_property( + 'image', webpage) + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de48a37ad..2aa369663 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -258,6 +258,7 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE +from .detik import DuaPuluhDetikIE from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index db13d9968..e9fde09e6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -3155,11 +3155,6 @@ class GenericIE(InfoExtractor): if embed_url and embed_url != url: return self.url_result(embed_url) - if not found: - # DetikFlow: It's basically a 'modified' FlowPlayer used in https://20.detik.com - found = re.findall( - r'["\']videoUrl["\']\s*:\s*["\']([^"\']+)["\']', webpage) - if not found: raise UnsupportedError(url) From f428f2fa90cf62f4e590d73c0bc3e380680a02b3 Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Tue, 3 Apr 2018 11:02:44 +0700 Subject: [PATCH 09/10] [20detik] Add _extract_urls static method --- youtube_dl/extractor/detik.py | 8 +++++++- youtube_dl/extractor/generic.py | 9 ++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index e97745acd..5a298f4f6 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -15,7 +15,7 @@ class DuaPuluhDetikIE(InfoExtractor): 'info_dict': { 'id': '180403001', 'title': 'Dahsyatnya Rudal Anti-balistik yang Diuji Coba Rusia', - 'description': '', + 'description': 'md5:909c645cc494f5d9d7089963c13a695d', 'thumbnail': r're:^https?://.*\.jpg(\?.*)?$', 'ext': 'mp4' } @@ -30,6 +30,12 @@ class DuaPuluhDetikIE(InfoExtractor): } }] + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'[^\']]+?src=(["\'])(?Phttps?://20\.detik\.com/embed/(\d+)[^"\']+?)\1', + webpage)] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e9fde09e6..853b0316a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -105,6 +105,7 @@ from .springboardplatform import SpringboardPlatformIE from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE +from .detik import DuaPuluhDetikIE class GenericIE(InfoExtractor): @@ -2839,11 +2840,9 @@ class GenericIE(InfoExtractor): return info # Look for 20detik (https://20.detik.com) embeds - mobj = re.search( - r']+?src=(["\'])(?Phttps?://20\.detik\.com/embed/(\d+)[^"\']+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) + duapuluhdetik_urls = DuaPuluhDetikIE._extract_urls(webpage) + if duapuluhdetik_urls: + return self.playlist_from_matches(duapuluhdetik_urls, video_id, video_title, getter=unescapeHTML, ie=DuaPuluhDetikIE.ie_key()) # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) From 6633aa7340a1daac5e5c7f8fa121463034896f29 Mon Sep 17 00:00:00 2001 From: Surya Oktafendri Date: Tue, 3 Apr 2018 16:16:03 +0700 Subject: [PATCH 10/10] [20detik] Fix embed URL detection --- youtube_dl/extractor/detik.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/detik.py b/youtube_dl/extractor/detik.py index 5a298f4f6..f438314fc 100644 --- a/youtube_dl/extractor/detik.py +++ b/youtube_dl/extractor/detik.py @@ -33,7 +33,7 @@ class DuaPuluhDetikIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return [m.group('url') for m in re.finditer( - r'[^\']]+?src=(["\'])(?Phttps?://20\.detik\.com/embed/(\d+)[^"\']+?)\1', + r']+?src\s*=\s*(["\'])(?Phttps?://20\.detik\.com/embed/\d+).+?\1', webpage)] def _real_extract(self, url):