From 5da61feddffc2d03e9a7c9b95bc0b63d44424ab9 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 21 Jul 2019 09:47:57 +0900 Subject: [PATCH] Make further requested changes. --- youtube_dl/extractor/yahoo.py | 171 +++++++++++++--------------------- 1 file changed, 67 insertions(+), 104 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 881841f9e..b54605ba0 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -566,36 +566,28 @@ class YahooJapanNewsIE(InfoExtractor): IE_NAME = 'yahoo:japannews' IE_DESC = 'Yahoo! Japan News' _VALID_URL = r'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)(/[^\d]*(?P<id>\d[\d-]*\d))?' - _TESTS = [ - { - 'url': 'https://headlines.yahoo.co.jp/videonews/fnn?a=20190719-00421051-fnn-soci', + _TESTS = [{ + 'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int', 'info_dict': { - 'id': '20190719-00421051', + 'id': '1736242', 'ext': 'mp4', - 'title': '15階から2歳女児転落死\u3000母は台所「目を離した隙に」(フジテレビ系(FNN)) - Yahoo!ニュース', - 'description': '大阪市で18日夜、マンションの15階から2歳の女の子が転落し病院に運ばれたが、まもな - Yahoo!ニュース(フジテレビ系(FNN))', + 'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース', + 'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))', 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', - }, - 'params': {'skip_download': True}, - }, - { + }, 'params': {'skip_download': True}, + }, { 'url': 'https://headlines.yahoo.co.jp/videonews/', 'only_matching': True, - }, - { + }, { 'url': 'https://news.yahoo.co.jp', 'only_matching': True, - }, - { + }, { 'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/', 'only_matching': True, - }, - { + }, { 'url': 'https://news.yahoo.co.jp/feature/1356', 'only_matching': True - }, - ] - _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' + }] def _extract_formats(self, json_data, content_id): formats = [] @@ -603,8 
+595,7 @@ class YahooJapanNewsIE(InfoExtractor): video_data = try_get( json_data, lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], - list, - ) + list) for vid in video_data or []: delivery = vid.get('delivery') url = url_or_none(vid.get('Url')) @@ -614,29 +605,21 @@ class YahooJapanNewsIE(InfoExtractor): formats.extend( self._extract_m3u8_formats( url, content_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False, - ) - ) + m3u8_id='hls', fatal=False)) else: - formats.append( - { - 'url': url, - 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), - 'ext': determine_ext(url), - 'height': int_or_none(vid.get('height')), - 'width': int_or_none(vid.get('width')), - 'btr': int_or_none(vid.get('bitrate')), - } - ) + formats.append({ + 'url': url, + 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), + 'ext': determine_ext(url), + 'height': int_or_none(vid.get('height')), + 'width': int_or_none(vid.get('width')), + 'btr': int_or_none(vid.get('bitrate')) + }) self._remove_duplicate_formats(formats) self._sort_formats(formats) return formats - @staticmethod - def _get_md5(s): - return hashlib.md5(s.encode()).hexdigest() - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') @@ -644,46 +627,34 @@ class YahooJapanNewsIE(InfoExtractor): display_id = mobj.group('id') or host webpage = self._download_webpage(url, display_id) - title = ( - self._og_search_title(webpage, default=None) - or self._html_search_meta('twitter:title', webpage, default=None) - or self._html_search_regex('([^<]+)', webpage, 'title') - ) - description = ( - self._og_search_description(webpage, default=None) - or self._html_search_meta('description', webpage, default=None) - or self._html_search_meta('twitter:description', webpage, default=None) - ) + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None + ) or self._html_search_regex('([^<]+)', webpage, 'title') + description = 
self._html_search_meta([ + 'og:description', 'description', 'twitter:description' + ], webpage, 'description', default=None) thumbnail = self._og_search_thumbnail( webpage, default=None - ) or self._html_search_meta('twitter:image', webpage, default=None) - space_id = ( - self._search_regex( - r']+class=(["\'])yvpub-player\1[^>]+spaceid=(?P<spaceid>[^&"\']+)', - webpage, 'spaceid', group='spaceid', default=None, - ) - or self._search_regex( - r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', - webpage, 'spaceid', default=None, - ) - or self._search_regex(r'