diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index c7a91a986..79df7bc2a 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -508,6 +508,11 @@ from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kakao import KakaoIE
 from .kaltura import KalturaIE
+from .kanald import (
+    KanaldIE,
+    KanaldEmbedIE,
+    KanaldSerieIE,
+)
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
diff --git a/youtube_dl/extractor/kanald.py b/youtube_dl/extractor/kanald.py
new file mode 100644
index 000000000..b1e387b0c
--- /dev/null
+++ b/youtube_dl/extractor/kanald.py
@@ -0,0 +1,164 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    merge_dicts,
+    urljoin,
+)
+
+
+class KanaldBaseIE(InfoExtractor):
+    """Shared extraction logic for kanald.com.tr video pages.
+
+    Subclasses supply ``_VALID_URL`` (with an ``id`` group) and ``_TESTS``.
+    """
+
+    def _real_extract(self, url):
+        """Build the info dict from the page's VideoObject JSON-LD block."""
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # FIXME: https://www.kanald.com.tr/kuzeyguney/80-bolum-izle/19364
+        # -> Invalid control character at: line 5 column 146 (char 255)
+        # NOTE(review): the <script ...> / </script> delimiters of this
+        # pattern are reconstructed -- confirm against the live markup.
+        json_ld = self._search_regex(
+            r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?:\s+)?(?P<json_ld>{[^<]+VideoObject[^<]+})(?:\s+)?</script>',
+            webpage, 'JSON-LD', group='json_ld')
+        ld_info = self._json_ld(json_ld, video_id)
+
+        # re.search rather than re.match: in an absolute URL the host never
+        # starts at position 0, so re.match would prefix URLs that already
+        # point at dogannet.tv.
+        if not re.search(r'dogannet\.tv', ld_info['url']):
+            ld_info['url'] = (
+                'https://soledge13.dogannet.tv/%s' % ld_info['url'])
+
+        return merge_dicts(ld_info, {'id': video_id})
+
+
+class KanaldIE(KanaldBaseIE):
+    """Extractor for single episode pages."""
+
+    _VALID_URL = r'''(?x)
+                    https?://(?:www\.)?kanald\.com\.tr/(?:[a-zA-Z0-9-]+)/
+                    (?:
+                        (?:[0-9]+)-bolum|
+                        (?:[0-9]+)-bolum-izle|
+                        bolumler|
+                        bolum
+                    )/
+                    (?P<id>[a-zA-Z0-9-]+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.kanald.com.tr/kuzeyguney/1-bolum/10115',
+        'md5': '8a32b6e894d45d618360b8b01173de9a',
+        'info_dict': {
+            'id': '10115',
+            'title': '1.Bölüm',
+            'description': 'md5:64edbdd153b7eefdf92c31bf5a6e5c1b',
+            'upload_date': '20110907',
+            'timestamp': 1315426815,
+            'ext': 'm3u8',
+        }
+    }, {
+        'url': 'https://www.kanald.com.tr/kuzeyguney/79-bolum-izle/19270',
+        'only_matching': True
+    }, {
+        'url': 'https://www.kanald.com.tr/sevdanin-bahcesi/bolumler/sevdanin-bahcesi-2-bolum',
+        'only_matching': True
+    }, {
+        'url': 'https://www.kanald.com.tr/yarim-elma/bolum/yarim-elma-36-bolum',
+        'only_matching': True
+    }, {
+        'url': 'https://www.kanald.com.tr/ask-ve-gunah/bolumler/ask-ve-gunah-120-bolum-final',
+        'only_matching': True
+    }]
+
+
+class KanaldEmbedIE(KanaldBaseIE):
+    """Extractor for ``/embed/<id>`` pages."""
+
+    _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/embed/(?P<id>[a-zA-Z0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://www.kanald.com.tr/embed/5465f0d2cf45af1064b73077',
+        'md5': '8a32b6e894d45d618360b8b01173de9a',
+        'info_dict': {
+            'id': '5465f0d2cf45af1064b73077',
+            'title': '1.Bölüm',
+            'description': 'md5:64edbdd153b7eefdf92c31bf5a6e5c1b',
+            'upload_date': '20110907',
+            'timestamp': 1315426815,
+            'ext': 'm3u8',
+        }
+    }]
+
+
+class KanaldSerieIE(InfoExtractor):
+    """Playlist extractor for a series' episode listing."""
+
+    _VALID_URL = r'https?://(?:www\.)?kanald\.com\.tr/(?P<id>[a-zA-Z0-9-]+)/(?:bolum|bolumler)$'
+
+    _TESTS = [{
+        'url': 'https://www.kanald.com.tr/kuzeyguney/bolum',
+        'info_dict': {
+            'id': 'kuzeyguney'
+        },
+        'playlist_mincount': 80
+    }, {
+        'url': 'https://www.kanald.com.tr/iki-yalanci/bolumler',
+        'only_matching': True
+    }]
+
+    def extract_episodes(self, url, playlist_id):
+        """Yield a url_result for each episode linked from the listing.
+
+        The listing at ``url`` is requested with an increasing ``page``
+        query parameter until a page contributes no episode link that has
+        not already been yielded.
+        """
+        # NOTE(review): the opening of this pattern (the anchor tag up to
+        # the start of the path group) is a reconstruction -- confirm it
+        # against the live markup.
+        episode_re = r'<a\s[^>]*?href=(["\'])/?(?P<path>%s[a-zA-Z0-9-/]+)\1[^>]*>' % re.escape(playlist_id)
+
+        seen_paths = set()
+        page = 1
+        while True:
+            webpage = self._download_webpage(
+                url, playlist_id, 'Downloading page %s' % page, query={
+                    'page': page,
+                })
+
+            # findall returns one (quote, path) tuple per matching link.
+            new_paths = []
+            for _, path in re.findall(episode_re, webpage):
+                if path and path not in seen_paths:
+                    seen_paths.add(path)
+                    new_paths.append(path)
+
+            # Stop on a page without links, and also when the site keeps
+            # serving links already collected (avoids an endless loop).
+            if not new_paths:
+                break
+
+            for path in new_paths:
+                yield self.url_result(
+                    'https://www.kanald.com.tr/%s' % path,
+                    ie=KanaldIE.ie_key())
+
+            page += 1
+
+    def _real_extract(self, url):
+        """Return a playlist with every episode of the series at ``url``."""
+        playlist_id = self._match_id(url)
+
+        return self.playlist_result(
+            self.extract_episodes(url, playlist_id), playlist_id)