From 0f726bc002025422fea8c0c55f5ecca02be83894 Mon Sep 17 00:00:00 2001 From: kosantosbik Date: Fri, 3 May 2019 18:19:30 +0300 Subject: [PATCH 1/9] [Kanal D]Added support for new site --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kanald.py | 122 +++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 youtube_dl/extractor/kanald.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0e3ccb82d..8eedfb7a2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1524,3 +1524,4 @@ from .zattoo import ( from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE +from .kanald import KanaldIE, KanaldSerieIE diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py new file mode 100644 index 000000000..4bbe4a98e --- /dev/null +++ b/youtube_dl/extractor/kanald.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from re import findall + +from .common import InfoExtractor +from ..compat import compat_str + + +class KanaldIE(InfoExtractor): + "Kanal D TV Website extractor" + IE_NAME = 'Kanal D' + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?:.*)/(?P.*\d+.*bolum(?!ler).*)/?' + _TESTS = [{ + 'url': 'https://www.kanald.com.tr/kuzeyguney/1-bolum/10115', + 'md5': '88d518f7803b53e9e6187b05fe0f1a63', + 'info_dict': { + 'id': '1-bolum/10115', + 'ext': 'm3u8', + 'title': '1.Bölüm', + 'release_date': '20110907', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Kanal D', + 'description': '1.Bölüm' + } + }, { + 'url': + 'https://www.kanald.com.tr/sevdanin-bahcesi/bolumler/sevdanin-bahcesi-2-bolum', + 'only_matching': True + }, { + 'url': + 'https://www.kanald.com.tr/yarim-elma/bolum/yarim-elma-36-bolum', + 'only_matching': True + }, { + 'url': + 'https://www.kanald.com.tr/ask-ve-gunah/bolumler/ask-ve-gunah-120-bolum-final', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') + video_url = "https://soledge13.dogannet.tv/" + self._search_regex( + r'["\']contentUrl["\']:["\'](?P.*)["\']', webpage, + 'video_url') + formats = self._extract_m3u8_formats(video_url, video_id) + thumbnail = self._search_regex( + r'.*)["\'].*', + webpage, 'thumbnail') + description = self._og_search_description(webpage) + year = self._search_regex( + r'["\']uploadDate["\']:["\'](?P\d{4}).*["\']', webpage, + 'year') + month = self._search_regex( + r'["\']uploadDate["\']:["\']\d{4}-(?P\d\d).*["\']', webpage, + 'month') + day = self._search_regex( + r'["\']uploadDate["\']:["\']\d{4}-\d\d-(?P\d\d).*["\']', + webpage, 'day') + release_date = year + month + day + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'url': video_url, + 'uploader': compat_str('Kanal D'), + 'release_date': release_date + } + + +class KanaldSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P.*)/(?:bolum|bolumler)' + IE_NAME = 'Kanal D:serie' + _TESTS = [{ + 'url': 'https://www.kanald.com.tr/kuzeyguney/bolum', + 'info_dict': { + 'id': 'kuzeyguney' + }, + 'playlist_mincount': 80 + }, { + 'url': 'https://www.kanald.com.tr/iki-yalanci/bolumler', + 'only_matching': True + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + next_page = url + webpage = None + page = 1 + has_more = True + entries = [] + + while has_more: + webpage = self._download_webpage(next_page, + playlist_id, + note='Downloading page %s' % page) + + try: + next_page = 'https://www.kanald.com.tr' + self._search_regex( + r'class=["\']next["\']>.*)["\']>.*', + webpage, + 'hasmore', + default=None, + fatal=False) + page += 1 + except TypeError: + has_more = False + + page_entries = findall( + r'.*)["\'].*', + webpage) + + for entry in page_entries: + entries.append( + self.url_result('https://www.kanald.com.tr%s' % entry, + ie=KanaldIE.ie_key())) + + return self.playlist_result(entries, playlist_id) From 50b937c65933a1f61e02788ba656b56df77e73b0 Mon Sep 17 00:00:00 2001 From: kosantosbik <312roadrunner@gmail.com> Date: Sun, 5 May 2019 00:46:21 +0300 Subject: [PATCH 2/9] KanaldSerieIE wrong match fix --- youtube_dl/extractor/kanald.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index 4bbe4a98e..832da6f0c 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -73,7 +73,7 @@ class KanaldIE(InfoExtractor): class KanaldSerieIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P.*)/(?:bolum|bolumler)' + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P.*)/(?:bolum|bolumler)$' IE_NAME = 'Kanal D:serie' _TESTS = [{ 'url': 'https://www.kanald.com.tr/kuzeyguney/bolum', From cf162a440b30351419fd6306cdc8c6c3e19e254e Mon Sep 17 00:00:00 2001 From: Enes Date: Mon, 6 May 2019 02:42:01 +0300 Subject: [PATCH 3/9] Fix kanald.com.tr --- youtube_dl/extractor/extractors.py | 5 ++ youtube_dl/extractor/kanald.py | 137 +++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 youtube_dl/extractor/kanald.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c7a91a986..79df7bc2a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -508,6 +508,11 @@ from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kakao import KakaoIE from .kaltura import KalturaIE +from .kanald import ( + KanaldIE, + KanaldEmbedIE, + KanaldSerieIE, +) from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py new file mode 100644 index 000000000..b1e387b0c --- /dev/null +++ b/youtube_dl/extractor/kanald.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + merge_dicts, + urljoin, +) + + +class KanaldBaseIE(InfoExtractor): + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info = { + 'id': video_id, + } + + """FIXME: https://www.kanald.com.tr/kuzeyguney/80-bolum-izle/19364 -> Invalid control character at: line 5 column 146 (char 255)""" + + json_ld = self._search_regex( + r'(?is)]+type=(["\'])application/ld\+json\1[^>]*>(?:\s+)?(?P{[^<]+VideoObject[^<]+})(?:\s+)?', webpage, 'JSON-LD', group='json_ld') + ld_info = self._json_ld(json_ld, video_id) + + if not re.match(r'dogannet\.tv', ld_info['url']): + ld_info.update({ + 'url': 'https://soledge13.dogannet.tv/%s' % ld_info['url'] + }) + + return merge_dicts(ld_info, info) + + +class KanaldIE(KanaldBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?kanald\.com\.tr/(?:[a-zA-Z0-9-]+)/ + (?: + (?:[0-9]+)-bolum| + (?:[0-9]+)-bolum-izle| + bolumler| + bolum + )/ + (?P[a-zA-Z0-9-]+) + ''' + + _TESTS = [{ + 'url': 'https://www.kanald.com.tr/kuzeyguney/1-bolum/10115', + 'md5': '8a32b6e894d45d618360b8b01173de9a', + 'info_dict': { + 'id': '10115', + 'title': '1.Bölüm', + 'description': 'md5:64edbdd153b7eefdf92c31bf5a6e5c1b', + 'upload_date': '20110907', + 'timestamp': 1315426815, + 'ext': 'm3u8', + } + }, { + 'url': 'https://www.kanald.com.tr/kuzeyguney/79-bolum-izle/19270', + 'only_matching': True + }, { + 'url': 'https://www.kanald.com.tr/sevdanin-bahcesi/bolumler/sevdanin-bahcesi-2-bolum', + 'only_matching': True + }, { + 'url': 'https://www.kanald.com.tr/yarim-elma/bolum/yarim-elma-36-bolum', + 'only_matching': True + }, { + 'url': 'https://www.kanald.com.tr/ask-ve-gunah/bolumler/ask-ve-gunah-120-bolum-final', + 'only_matching': True + }] + + +class KanaldEmbedIE(KanaldBaseIE): + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/embed/(?P[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'https://www.kanald.com.tr/embed/5465f0d2cf45af1064b73077', + 'md5': '8a32b6e894d45d618360b8b01173de9a', + 'info_dict': { + 'id': '5465f0d2cf45af1064b73077', + 'title': '1.Bölüm', + 'description': 'md5:64edbdd153b7eefdf92c31bf5a6e5c1b', + 'upload_date': '20110907', + 'timestamp': 1315426815, + 'ext': 'm3u8', + } + }] + + +class KanaldSerieIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P[a-zA-Z0-9-]+)/(?:bolum|bolumler)$' + + _TESTS = [{ + 'url': 'https://www.kanald.com.tr/kuzeyguney/bolum', + 'info_dict': { + 'id': 'kuzeyguney' + }, + 'playlist_mincount': 80 + }, { + 'url': 'https://www.kanald.com.tr/iki-yalanci/bolumler', + 'only_matching': True + }] + + def extract_episodes(self, url, playlist_id): + page = 1 + has_more = True + + while has_more: + webpage = self._download_webpage( + url, playlist_id, 'Downloading page %s' % page, query={ + 'page': page, + }) + + episode_urls = re.findall(r'' + re.escape(playlist_id) + r'[a-zA-Z0-9-/]+)\1[^>]*>', webpage) + + if len(episode_urls) is 0: + has_more = False + continue + + for episode_url in episode_urls: + episode_url = episode_url[1] + if not episode_url: + continue + yield self.url_result( + 'https://www.kanald.com.tr/%s' % episode_url, + ie=KanaldIE.ie_key()) + + page += 1 + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + return self.playlist_result(self.extract_episodes(url, playlist_id), playlist_id) From bd6c7743c85df908cc132ca9698806032056f741 Mon Sep 17 00:00:00 2001 From: kosantosbik <312roadrunner@gmail.com> Date: Mon, 6 May 2019 03:22:01 +0300 Subject: [PATCH 4/9] Extractor import alphabetical order fix --- youtube_dl/extractor/extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c4019f1bb..6db86dc5e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1529,4 +1529,3 @@ from .zattoo import ( from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE -from .kanald import KanaldIE, KanaldSerieIE From 4b769d40ea80899236efc8fdc885c489f3dfe7b0 Mon Sep 17 00:00:00 2001 From: Enes Date: Mon, 6 May 2019 04:05:06 +0300 Subject: [PATCH 5/9] Update kanald.py --- youtube_dl/extractor/kanald.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index b1e387b0c..fd4653f3c 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - merge_dicts, - urljoin, -) +from ..utils import merge_dicts class KanaldBaseIE(InfoExtractor): From fbf0c21b33f340afe454cdb2d75d7d873280b20b Mon Sep 17 00:00:00 2001 From: Enes Date: Mon, 6 May 2019 14:38:00 +0300 Subject: [PATCH 6/9] Update kanald.py --- youtube_dl/extractor/kanald.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index fd4653f3c..57b27a169 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + merge_dicts, + try_get, +) class KanaldBaseIE(InfoExtractor): @@ -20,15 +23,17 @@ class KanaldBaseIE(InfoExtractor): """FIXME: https://www.kanald.com.tr/kuzeyguney/80-bolum-izle/19364 -> Invalid control character at: line 5 column 146 (char 255)""" - json_ld = self._search_regex( + search_json_ld = self._search_regex( r'(?is)]+type=(["\'])application/ld\+json\1[^>]*>(?:\s+)?(?P{[^<]+VideoObject[^<]+})(?:\s+)?', webpage, 'JSON-LD', group='json_ld') - ld_info = self._json_ld(json_ld, video_id) + json_ld = self._parse_json(search_json_ld, video_id) - if not re.match(r'dogannet\.tv', ld_info['url']): - ld_info.update({ - 'url': 'https://soledge13.dogannet.tv/%s' % ld_info['url'] + if not re.match(r'dogannet\.tv', json_ld['contentUrl']): + json_ld.update({ + 'contentUrl': 'https://soledge13.dogannet.tv/%s' % json_ld['contentUrl'] }) + ld_info = self._json_ld(json_ld, video_id) + return merge_dicts(ld_info, info) @@ -118,7 +123,7 @@ class KanaldSerieIE(InfoExtractor): continue for episode_url in episode_urls: - episode_url = episode_url[1] + episode_url = try_get(episode_url, lambda x: x[1]) if not episode_url: continue yield self.url_result( From 29e7569801c3f5649026ebde2bb0c31ca076c293 Mon Sep 17 00:00:00 2001 From: Enes Date: Mon, 6 May 2019 23:43:08 +0300 Subject: [PATCH 7/9] Fixed invalid control character error --- youtube_dl/extractor/kanald.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index 57b27a169..db4d08a1e 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor from ..utils import ( + ExtractorError, merge_dicts, try_get, ) @@ -21,11 +23,14 @@ class KanaldBaseIE(InfoExtractor): 'id': video_id, } - """FIXME: https://www.kanald.com.tr/kuzeyguney/80-bolum-izle/19364 -> Invalid control character at: line 5 column 146 (char 255)""" - search_json_ld = self._search_regex( r'(?is)]+type=(["\'])application/ld\+json\1[^>]*>(?:\s+)?(?P{[^<]+VideoObject[^<]+})(?:\s+)?', webpage, 'JSON-LD', group='json_ld') - json_ld = self._parse_json(search_json_ld, video_id) + + # https://stackoverflow.com/questions/22394235/invalid-control-character-with-python-json-loads + try: + json_ld = json.loads(search_json_ld, strict=False) + except ValueError as ve: + raise ExtractorError('%s: Failed to parse JSON ' % video_id, cause=ve) if not re.match(r'dogannet\.tv', json_ld['contentUrl']): json_ld.update({ From defbc8150a5cd39290c5f76e0ed8170265e1c543 Mon Sep 17 00:00:00 2001 From: Enes Date: Tue, 7 May 2019 03:39:02 +0300 Subject: [PATCH 8/9] Update kanald.py --- youtube_dl/extractor/kanald.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index db4d08a1e..668ee5723 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -98,7 +98,7 @@ class KanaldEmbedIE(KanaldBaseIE): class KanaldSerieIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P[a-zA-Z0-9-]+)/(?:bolum|bolumler)$' + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P[a-zA-Z0-9-]+)/(?:bolum|bolumler)' _TESTS = [{ 'url': 'https://www.kanald.com.tr/kuzeyguney/bolum', From fcf5e0ac1cf7b38af89599aa09f52becfc1a52cf Mon Sep 17 00:00:00 2001 From: kosantosbik <312roadrunner@gmail.com> Date: Wed, 8 May 2019 02:22:22 +0300 Subject: [PATCH 9/9] Revert "Fixed invalid control character error" --- youtube_dl/extractor/kanald.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py index 668ee5723..57b27a169 100644 --- a/youtube_dl/extractor/kanald.py +++ b/youtube_dl/extractor/kanald.py @@ -2,11 +2,9 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( - ExtractorError, merge_dicts, try_get, ) @@ -23,14 +21,11 @@ class KanaldBaseIE(InfoExtractor): 'id': video_id, } + """FIXME: https://www.kanald.com.tr/kuzeyguney/80-bolum-izle/19364 -> Invalid control character at: line 5 column 146 (char 255)""" + search_json_ld = self._search_regex( r'(?is)]+type=(["\'])application/ld\+json\1[^>]*>(?:\s+)?(?P{[^<]+VideoObject[^<]+})(?:\s+)?', webpage, 'JSON-LD', group='json_ld') - - # https://stackoverflow.com/questions/22394235/invalid-control-character-with-python-json-loads - try: - json_ld = json.loads(search_json_ld, strict=False) - except ValueError as ve: - raise ExtractorError('%s: Failed to parse JSON ' % video_id, cause=ve) + json_ld = self._parse_json(search_json_ld, video_id) if not re.match(r'dogannet\.tv', json_ld['contentUrl']): json_ld.update({ @@ -98,7 +93,7 @@ class KanaldEmbedIE(KanaldBaseIE): class KanaldSerieIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P[a-zA-Z0-9-]+)/(?:bolum|bolumler)' + _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P[a-zA-Z0-9-]+)/(?:bolum|bolumler)$' _TESTS = [{ 'url': 'https://www.kanald.com.tr/kuzeyguney/bolum',