From 0fe7783eced5c62dbd95780c2150fd1080bd3927 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 28 Apr 2018 01:59:15 +0700
Subject: [PATCH 001/156] [extractor/common] Add _download_json_handle

---
 youtube_dl/extractor/common.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 59b9d3739..e0c3c8eb0 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -682,18 +682,30 @@ class InfoExtractor(object):
             else:
                 self.report_warning(errmsg + str(ve))
 
-    def _download_json(self, url_or_request, video_id,
-                       note='Downloading JSON metadata',
-                       errnote='Unable to download JSON metadata',
-                       transform_source=None,
-                       fatal=True, encoding=None, data=None, headers={}, query={}):
-        json_string = self._download_webpage(
+    def _download_json_handle(
+            self, url_or_request, video_id, note='Downloading JSON metadata',
+            errnote='Unable to download JSON metadata', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={}):
+        """Return a tuple (JSON object, URL handle)"""
+        res = self._download_webpage_handle(
             url_or_request, video_id, note, errnote, fatal=fatal,
             encoding=encoding, data=data, headers=headers, query=query)
-        if (not fatal) and json_string is False:
-            return None
+        if res is False:
+            return res
+        json_string, urlh = res
         return self._parse_json(
-            json_string, video_id, transform_source=transform_source, fatal=fatal)
+            json_string, video_id, transform_source=transform_source,
+            fatal=fatal), urlh
+
+    def _download_json(
+            self, url_or_request, video_id, note='Downloading JSON metadata',
+            errnote='Unable to download JSON metadata', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={}):
+        res = self._download_json_handle(
+            url_or_request, video_id, note=note, errnote=errnote,
+            transform_source=transform_source, fatal=fatal, encoding=encoding,
+            data=data, headers=headers, query=query)
+        return res if res is False else res[0]
 
     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
         if transform_source:

From 6cc622327ff8289f94894f3695ed31014c61cf8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 28 Apr 2018 02:47:17 +0700
Subject: [PATCH 002/156] [utils] Introduce merge_dicts

---
 test/test_utils.py              | 12 ++++++++++++
 youtube_dl/extractor/generic.py | 16 +---------------
 youtube_dl/utils.py             | 14 ++++++++++++++
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 253a7fe17..14503ab53 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -42,6 +42,7 @@ from youtube_dl.utils import (
     is_html,
     js_to_json,
     limit_length,
+    merge_dicts,
     mimetype2ext,
     month_by_name,
     multipart_encode,
@@ -669,6 +670,17 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(dict_get(d, ('b', 'c', key, )), None)
         self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value)
 
+    def test_merge_dicts(self):
+        self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2})
+        self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1})
+        self.assertEqual(merge_dicts({'a': 1}, {'a': None}), {'a': 1})
+        self.assertEqual(merge_dicts({'a': 1}, {'a': ''}), {'a': 1})
+        self.assertEqual(merge_dicts({'a': 1}, {}), {'a': 1})
+        self.assertEqual(merge_dicts({'a': None}, {'a': 1}), {'a': 1})
+        self.assertEqual(merge_dicts({'a': ''}, {'a': 1}), {'a': ''})
+        self.assertEqual(merge_dicts({'a': ''}, {'a': 'abc'}), {'a': 'abc'})
+        self.assertEqual(merge_dicts({'a': None}, {'a': ''}, {'a': 'abc'}), {'a': 'abc'})
+
     def test_encode_compat_str(self):
         self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест')
         self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест')
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index af1322e00..d48914495 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -23,6 +23,7 @@ from ..utils import (
     is_html,
     js_to_json,
     KNOWN_EXTENSIONS,
+    merge_dicts,
     mimetype2ext,
     orderedSet,
     sanitized_Request,
@@ -3002,21 +3003,6 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 sharevideos_urls, video_id, video_title)
 
-        def merge_dicts(dict1, dict2):
-            merged = {}
-            for k, v in dict1.items():
-                if v is not None:
-                    merged[k] = v
-            for k, v in dict2.items():
-                if v is None:
-                    continue
-                if (k not in merged or
-                        (isinstance(v, compat_str) and v and
-                            isinstance(merged[k], compat_str) and
-                            not merged[k])):
-                    merged[k] = v
-            return merged
-
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 574284e94..b460393bf 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2225,6 +2225,20 @@ def try_get(src, getter, expected_type=None):
             return v
 
 
+def merge_dicts(*dicts):
+    merged = {}
+    for a_dict in dicts:
+        for k, v in a_dict.items():
+            if v is None:
+                continue
+            if (k not in merged or
+                    (isinstance(v, compat_str) and v and
+                        isinstance(merged[k], compat_str) and
+                        not merged[k])):
+                merged[k] = v
+    return merged
+
+
 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
 
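Note: a quick sketch of the merge semantics the new helper implements, as
pinned down by the tests above (illustrative calls, not part of the patch):

    from youtube_dl.utils import merge_dicts

    # Earlier dicts win for truthy values...
    merge_dicts({'a': 1}, {'a': 2})       # {'a': 1}
    # ...None never wins, and an empty string loses to a later non-empty
    # string, but not to a later non-string value.
    merge_dicts({'a': None}, {'a': 1})    # {'a': 1}
    merge_dicts({'a': ''}, {'a': 'abc'})  # {'a': 'abc'}
    merge_dicts({'a': ''}, {'a': 1})      # {'a': ''}
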
From e7e4a6e0f9166cee82c165ca69a6a3c94ddc5f45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 28 Apr 2018 02:48:03 +0700
Subject: [PATCH 003/156] [extractor/common] Extract interaction statistic

---
 youtube_dl/extractor/common.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e0c3c8eb0..a9939b0fd 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1020,6 +1020,40 @@ class InfoExtractor(object):
         if isinstance(json_ld, dict):
             json_ld = [json_ld]
 
+        INTERACTION_TYPE_MAP = {
+            'CommentAction': 'comment',
+            'AgreeAction': 'like',
+            'DisagreeAction': 'dislike',
+            'LikeAction': 'like',
+            'DislikeAction': 'dislike',
+            'ListenAction': 'view',
+            'WatchAction': 'view',
+            'ViewAction': 'view',
+        }
+
+        def extract_interaction_statistic(e):
+            interaction_statistic = e.get('interactionStatistic')
+            if not isinstance(interaction_statistic, list):
+                return
+            for is_e in interaction_statistic:
+                if not isinstance(is_e, dict):
+                    continue
+                if is_e.get('@type') != 'InteractionCounter':
+                    continue
+                interaction_type = is_e.get('interactionType')
+                if not isinstance(interaction_type, compat_str):
+                    continue
+                interaction_count = int_or_none(is_e.get('userInteractionCount'))
+                if interaction_count is None:
+                    continue
+                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
+                if not count_kind:
+                    continue
+                count_key = '%s_count' % count_kind
+                if info.get(count_key) is not None:
+                    continue
+                info[count_key] = interaction_count
+
         def extract_video_object(e):
             assert e['@type'] == 'VideoObject'
             info.update({
@@ -1035,6 +1069,7 @@ class InfoExtractor(object):
                 'height': int_or_none(e.get('height')),
                 'view_count': int_or_none(e.get('interactionCount')),
             })
+            extract_interaction_statistic(e)
 
         for e in json_ld:
             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
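Note: a hypothetical JSON-LD object of the kind the new helper consumes
(values invented for illustration):

    e = {
        '@type': 'VideoObject',
        'interactionStatistic': [{
            '@type': 'InteractionCounter',
            'interactionType': 'http://schema.org/WatchAction',
            'userInteractionCount': 4394,
        }],
    }
    # extract_interaction_statistic(e) maps the last path segment of
    # interactionType ('WatchAction') through INTERACTION_TYPE_MAP to
    # 'view' and sets info['view_count'] = 4394, unless view_count was
    # already filled in elsewhere.
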
From ae1c585cee3eb183cddf7c30a09b75d887307dee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 28 Apr 2018 02:48:20 +0700
Subject: [PATCH 004/156] [vimeo] Extract JSON LD (closes #16295)

---
 youtube_dl/extractor/vimeo.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 08257147e..a026526b2 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,6 +16,7 @@ from ..utils import (
     ExtractorError,
     InAdvancePagedList,
     int_or_none,
+    merge_dicts,
     NO_DEFAULT,
     RegexNotFoundError,
     sanitized_Request,
@@ -639,16 +640,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
                     'preference': 1,
                 })
 
-        info_dict = self._parse_config(config, video_id)
-        formats.extend(info_dict['formats'])
+        info_dict_config = self._parse_config(config, video_id)
+        formats.extend(info_dict_config['formats'])
         self._vimeo_sort_formats(formats)
 
+        json_ld = self._search_json_ld(webpage, video_id, default={})
+
         if not cc_license:
            cc_license = self._search_regex(
                r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
                webpage, 'license', default=None, group='license')
 
-        info_dict.update({
+        info_dict = {
             'id': video_id,
             'formats': formats,
             'timestamp': unified_timestamp(timestamp),
@@ -658,7 +661,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
             'like_count': like_count,
             'comment_count': comment_count,
             'license': cc_license,
-        })
+        }
+
+        info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
 
         return info_dict
 

From 7dd6ab4a47b08beafe45befa29c44df2db00547e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 28 Apr 2018 04:51:39 +0700
Subject: [PATCH 005/156] [imdb] Extract all formats (closes #16249)

---
 youtube_dl/extractor/imdb.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 3ff672a89..425421968 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -3,7 +3,9 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
+    determine_ext,
     mimetype2ext,
     qualities,
     remove_end,
@@ -73,19 +75,25 @@ class ImdbIE(InfoExtractor):
             video_info_list = format_info.get('videoInfoList')
             if not video_info_list or not isinstance(video_info_list, list):
                 continue
-            video_info = video_info_list[0]
-            if not video_info or not isinstance(video_info, dict):
-                continue
-            video_url = video_info.get('videoUrl')
-            if not video_url:
-                continue
-            format_id = format_info.get('ffname')
-            formats.append({
-                'format_id': format_id,
-                'url': video_url,
-                'ext': mimetype2ext(video_info.get('videoMimeType')),
-                'quality': quality(format_id),
-            })
+            for video_info in video_info_list:
+                if not video_info or not isinstance(video_info, dict):
+                    continue
+                video_url = video_info.get('videoUrl')
+                if not video_url or not isinstance(video_url, compat_str):
+                    continue
+                if (video_info.get('videoMimeType') == 'application/x-mpegURL' or
+                        determine_ext(video_url) == 'm3u8'):
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                    continue
+                format_id = format_info.get('ffname')
+                formats.append({
+                    'format_id': format_id,
+                    'url': video_url,
+                    'ext': mimetype2ext(video_info.get('videoMimeType')),
+                    'quality': quality(format_id),
+                })
         self._sort_formats(formats)
 
         return {
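Note: a sketch of the videoInfoList shape this loop now walks (URLs and
fields invented; the real API payload may carry more keys):

    video_info_list = [{
        # expanded into per-quality formats via _extract_m3u8_formats
        'videoUrl': 'https://example.com/master.m3u8',
        'videoMimeType': 'application/x-mpegURL',
    }, {
        # appended as a single progressive format
        'videoUrl': 'https://example.com/video.mp4',
        'videoMimeType': 'video/mp4',
    }]
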
From 500a86a52ee46a3a1acc864b602b74d141afdc24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 29 Apr 2018 00:33:31 +0700
Subject: [PATCH 006/156] [downloader/fragment] Restart download if .ytdl file is corrupt (closes #16312)

---
 youtube_dl/downloader/fragment.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 927c7e491..917f6dc01 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -74,9 +74,14 @@ class FragmentFD(FileDownloader):
         return not ctx['live'] and not ctx['tmpfilename'] == '-'
 
     def _read_ytdl_file(self, ctx):
+        assert 'ytdl_corrupt' not in ctx
         stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
-        ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
-        stream.close()
+        try:
+            ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+        except Exception:
+            ctx['ytdl_corrupt'] = True
+        finally:
+            stream.close()
 
     def _write_ytdl_file(self, ctx):
         frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
@@ -158,11 +163,17 @@ class FragmentFD(FileDownloader):
         if self.__do_ytdl_file(ctx):
             if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
                 self._read_ytdl_file(ctx)
-                if ctx['fragment_index'] > 0 and resume_len == 0:
+                is_corrupt = ctx.get('ytdl_corrupt') is True
+                is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0
+                if is_corrupt or is_inconsistent:
+                    message = (
+                        '.ytdl file is corrupt' if is_corrupt else
+                        'Inconsistent state of incomplete fragment download')
                     self.report_warning(
-                        'Inconsistent state of incomplete fragment download. '
-                        'Restarting from the beginning...')
+                        '%s. Restarting from the beginning...' % message)
                     ctx['fragment_index'] = resume_len = 0
+                    if 'ytdl_corrupt' in ctx:
+                        del ctx['ytdl_corrupt']
                     self._write_ytdl_file(ctx)
             else:
                 self._write_ytdl_file(ctx)
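Note: the .ytdl checkpoint is a small JSON document; a plausible example of
the structure the code reads (the index value is invented):

    import json

    state = json.loads('{"downloader": {"current_fragment": {"index": 7}}}')
    state['downloader']['current_fragment']['index']  # 7
    # If json.loads() raises instead (e.g. on a truncated file), the
    # download now restarts from fragment 0 rather than crashing.
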
From 106c8c3edbc5b7e95cfba79ddc6252fad0adb859 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 29 Apr 2018 19:04:40 +0700
Subject: [PATCH 007/156] [nrktv] Update API host (closes #16324)

---
 youtube_dl/extractor/nrk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 18ead9426..3b4f51f61 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE):
                             (?:/\d{2}-\d{2}-\d{4})?
                             (?:\#del=(?P<part_id>\d+))?
                     ''' % _EPISODE_RE
-    _API_HOST = 'psapi-we.nrk.no'
+    _API_HOST = 'psapi-ne.nrk.no'
 
     _TESTS = [{
         'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',

From 12b0d4e0e1df6d6a8b9ce10b9a69013497adc2b0 Mon Sep 17 00:00:00 2001
From: Meneth32
Date: Sun, 29 Apr 2018 16:59:40 +0200
Subject: [PATCH 008/156] [redditr] Add support for old.reddit.com URLs

---
 youtube_dl/extractor/reddit.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py
index 53b1c967e..8372925be 100644
--- a/youtube_dl/extractor/reddit.py
+++ b/youtube_dl/extractor/reddit.py
@@ -47,7 +47,7 @@ class RedditIE(InfoExtractor):
 
 
 class RedditRIE(InfoExtractor):
-    _VALID_URL = r'(?P<url>https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'(?P<url>https?://(?:(?:www|old)\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))'
     _TESTS = [{
         'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
         'info_dict': {
@@ -74,6 +74,10 @@ class RedditRIE(InfoExtractor):
         # imgur
         'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
         'only_matching': True,
+    }, {
+        # imgur @ old reddit
+        'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+        'only_matching': True,
     }, {
         # streamable
         'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',

From 01aec8488084e62aa188b5167e57d01ef66cd256 Mon Sep 17 00:00:00 2001
From: Bastian de Groot
Date: Sun, 29 Apr 2018 17:14:37 +0200
Subject: [PATCH 009/156] [generic] Prefer enclosures over links in RSS feeds

---
 youtube_dl/extractor/generic.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index d48914495..252f97c26 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -191,6 +191,16 @@ class GenericIE(InfoExtractor):
                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
             }
         },
+        # RSS feed with enclosures and unsupported link URLs
+        {
+            'url': 'http://www.hellointernet.fm/podcast?format=rss',
+            'info_dict': {
+                'id': 'http://www.hellointernet.fm/podcast?format=rss',
+                'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.',
+                'title': 'Hello Internet',
+            },
+            'playlist_mincount': 100,
+        },
         # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
         {
             'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
@@ -2026,13 +2036,15 @@ class GenericIE(InfoExtractor):
 
         entries = []
         for it in doc.findall('./channel/item'):
-            next_url = xpath_text(it, 'link', fatal=False)
+            next_url = None
+            enclosure_nodes = it.findall('./enclosure')
+            for e in enclosure_nodes:
+                next_url = e.attrib.get('url')
+                if next_url:
+                    break
+
             if not next_url:
-                enclosure_nodes = it.findall('./enclosure')
-                for e in enclosure_nodes:
-                    next_url = e.attrib.get('url')
-                    if next_url:
-                        break
+                next_url = xpath_text(it, 'link', fatal=False)
 
             if not next_url:
                 continue
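Note: an illustrative feed item (URLs invented) showing the new precedence —
the enclosure URL is tried first and the <link> URL only as a fallback:

    <item>
      <link>http://www.hellointernet.fm/podcast/98</link>
      <enclosure url="http://example.com/hi/98.mp3" type="audio/mpeg"/>
    </item>
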
From 30226342ab346263b684170c4ce7d5266fec212e Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Sun, 29 Apr 2018 11:23:23 +0200
Subject: [PATCH 010/156] [youtube] Correctly disable polymer on all requests

Rather than just the ones that use the _download_webpage helper. The need
for this was made apparent by 0fe7783e, which refactored _download_json in
a way that completely avoids the use of _download_webpage, thus breaking
youtube.

Fixes #16323
---
 youtube_dl/extractor/youtube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e7bd1f18f..04aeb91af 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -246,9 +246,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
         return True
 
-    def _download_webpage(self, *args, **kwargs):
+    def _download_webpage_handle(self, *args, **kwargs):
         kwargs.setdefault('query', {})['disable_polymer'] = 'true'
-        return super(YoutubeBaseInfoExtractor, self)._download_webpage(
+        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
             *args, **compat_kwargs(kwargs))
 
     def _real_initialize(self):

From e5eadfa82f10bda43294d1da85024eec29c7973f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 29 Apr 2018 22:49:47 +0700
Subject: [PATCH 011/156] [udemy,xiami,yandexmusic] Override _download_webpage_handle instead of _download_webpage

---
 youtube_dl/extractor/udemy.py       | 4 ++--
 youtube_dl/extractor/xiami.py       | 4 ++--
 youtube_dl/extractor/yandexmusic.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 6d6c0a98f..439ed2a89 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -115,9 +115,9 @@ class UdemyIE(InfoExtractor):
             error_str += ' - %s' % error_data.get('formErrors')
         raise ExtractorError(error_str, expected=True)
 
-    def _download_webpage(self, *args, **kwargs):
+    def _download_webpage_handle(self, *args, **kwargs):
         kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
-        return super(UdemyIE, self)._download_webpage(
+        return super(UdemyIE, self)._download_webpage_handle(
             *args, **compat_kwargs(kwargs))
 
     def _download_json(self, url_or_request, *args, **kwargs):
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
index 7f871c8ec..8333fb534 100644
--- a/youtube_dl/extractor/xiami.py
+++ b/youtube_dl/extractor/xiami.py
@@ -9,8 +9,8 @@ from ..utils import int_or_none
 class XiamiBaseIE(InfoExtractor):
     _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
 
-    def _download_webpage(self, *args, **kwargs):
-        webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs)
+    def _download_webpage_handle(self, *args, **kwargs):
+        webpage = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs)
         if '>Xiami is currently not available in your country.<' in webpage:
             self.raise_geo_restricted('Xiami is currently not available in your country')
         return webpage
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index eb1062142..e85eca073 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -34,8 +34,8 @@ class YandexMusicBaseIE(InfoExtractor):
             'youtube-dl with --cookies',
             expected=True)
 
-    def _download_webpage(self, *args, **kwargs):
-        webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs)
+    def _download_webpage_handle(self, *args, **kwargs):
+        webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs)
         if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage:
             self._raise_captcha()
         return webpage
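Note: a generic sketch of the override pattern these two patches apply
(MyIE and some_param are placeholders, not real names). Since 0fe7783e the
call chain is _download_json -> _download_json_handle ->
_download_webpage_handle, so per-extractor request tweaks must hook
_download_webpage_handle to cover JSON downloads as well:

    from youtube_dl.compat import compat_kwargs
    from youtube_dl.extractor.common import InfoExtractor

    class MyIE(InfoExtractor):
        def _download_webpage_handle(self, *args, **kwargs):
            # Runs for webpage AND JSON downloads after the refactoring.
            kwargs.setdefault('query', {})['some_param'] = 'value'
            return super(MyIE, self)._download_webpage_handle(
                *args, **compat_kwargs(kwargs))
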
From 796bf9de45d6f01bf2d34ae22e1eacdc1a649fab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 29 Apr 2018 22:56:07 +0700
Subject: [PATCH 012/156] [yandexmusic] Convert release_year to int

---
 youtube_dl/extractor/yandexmusic.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index e85eca073..009203851 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -57,14 +57,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
         'info_dict': {
             'id': '4878838',
             'ext': 'mp3',
-            'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1',
+            'title': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1',
             'filesize': 4628061,
             'duration': 193.04,
             'track': 'Gypsy Eyes 1',
             'album': 'Gypsy Soul',
             'album_artist': 'Carlo Ambrosio',
-            'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio',
-            'release_year': '2009',
+            'artist': 'Carlo Ambrosio, Carlo Ambrosio & Fabio Di Bari',
+            'release_year': 2009,
         },
         'skip': 'Travis CI servers blocked by YandexMusic',
     }
@@ -120,7 +120,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
             track_info.update({
                 'album': album.get('title'),
                 'album_artist': extract_artist(album.get('artists')),
-                'release_year': compat_str(year) if year else None,
+                'release_year': int_or_none(year),
             })
 
         track_artist = extract_artist(track.get('artists'))
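Note: the conversion relies on utils.int_or_none, which tolerates both
strings and missing values — illustrative calls:

    from youtube_dl.utils import int_or_none

    int_or_none('2009')  # 2009
    int_or_none(2009)    # 2009
    int_or_none(None)    # None
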
From 4a733545867a014eb786348f8fb9e6ae95850742 Mon Sep 17 00:00:00 2001
From: Alex Seiler
Date: Sun, 5 Nov 2017 18:07:35 +0100
Subject: [PATCH 013/156] [zattoo] Add extractor (closes #14668)

---
 youtube_dl/extractor/extractors.py |   6 +
 youtube_dl/extractor/zattoo.py     | 234 +++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 youtube_dl/extractor/zattoo.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6fb65e4fe..9fe3f649d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1418,5 +1418,11 @@ from .youtube import (
 )
 from .zapiks import ZapiksIE
 from .zaq1 import Zaq1IE
+from .zattoo import (
+    QuicklineIE,
+    QuicklineLiveIE,
+    ZattooIE,
+    ZattooLiveIE,
+)
 from .zdf import ZDFIE, ZDFChannelIE
 from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py
new file mode 100644
index 000000000..928f22566
--- /dev/null
+++ b/youtube_dl/extractor/zattoo.py
@@ -0,0 +1,234 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from uuid import uuid4
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    ExtractorError,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class ZattooBaseIE(InfoExtractor):
+
+    _NETRC_MACHINE = 'zattoo'
+    _HOST_URL = 'https://zattoo.com'
+
+    _power_guide_hash = None
+
+    def _login(self, uuid, session_id):
+        (username, password) = self._get_login_info()
+        if not username or not password:
+            raise ExtractorError(
+                'A valid %s account is needed to access this media.' % self._NETRC_MACHINE,
+                expected=True)
+        login_form = {
+            'login': username,
+            'password': password,
+            'remember': True,
+        }
+        request = sanitized_Request(
+            '%s/zapi/v2/account/login' % self._HOST_URL,
+            urlencode_postdata(login_form))
+        request.add_header(
+            'Referer', '%s/login' % self._HOST_URL)
+        request.add_header(
+            'Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
+        request.add_header(
+            'Cookie', 'uuid=%s; beaker.session.id=%s' % (uuid, session_id))
+        response = self._request_webpage(
+            request, None, 'Logging in')
+        data = self._parse_json(response.read(), None)
+        return data['session']['power_guide_hash']
+
+    def _get_app_token_and_version(self):
+        host_webpage = self._download_webpage(
+            self._HOST_URL, None, 'Downloading %s' % self._HOST_URL)
+        app_token = self._html_search_regex(
+            r'
+
+class QuicklineIE(QuicklineBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<channel_name>[^/]+)/(?P<video_id>[0-9]+)'
+
+    def _real_extract(self, url):
+        channel_name, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_video(channel_name, video_id)
+
+
+class QuicklineLiveIE(QuicklineBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?mobiltv\.quickline\.com/watch/(?P<id>[^/]+)$'
+
+    def _real_extract(self, url):
+        channel_name = video_id = self._match_id(url)
+        return self._extract_video(channel_name, video_id, is_live=True)
+
+
+class ZattooIE(ZattooBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel_name>[^/]+?)/(?P<video_id>[0-9]+)[^/]+(?:/(?P<record_id>[0-9]+))?'
+
+    # Since regular videos are only available for 7 days and recorded videos
+    # are only available for a specific user, we cannot have detailed tests.
+    _TESTS = [{
+        'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste',
+        'only_matching': True,
+    }, {
+        'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_video(channel_name, video_id, record_id)
+
+
+class ZattooLiveIE(ZattooBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)$'
+
+    _TEST = {
+        'url': 'https://zattoo.com/watch/srf1',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        channel_name = video_id = self._match_id(url)
+        return self._extract_video(channel_name, video_id, is_live=True)
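Note: a minimal sketch of the login exchange this extractor performs,
assuming a zapi response of roughly this shape (the field value is
invented; only session.power_guide_hash is actually read by the code):

    import json

    response_body = '{"session": {"power_guide_hash": "deadbeef"}}'
    json.loads(response_body)['session']['power_guide_hash']  # 'deadbeef'
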
From 67ca1a8ef7ea6094e1e34518b93cdb5ba59f31b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Tue, 1 May 2018 01:48:21 +0700
Subject: [PATCH 014/156] [zattoo] Improve and simplify (closes #14676)

---
 youtube_dl/extractor/zattoo.py | 238 +++++++++++++++++++--------------
 1 file changed, 137 insertions(+), 101 deletions(-)

diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py
index 928f22566..773073d85 100644
--- a/youtube_dl/extractor/zattoo.py
+++ b/youtube_dl/extractor/zattoo.py
@@ -1,84 +1,82 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from uuid import uuid4
 import re
+from uuid import uuid4
 
 from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+    compat_HTTPError,
     compat_str,
+)
+from ..utils import (
     ExtractorError,
-    sanitized_Request,
+    int_or_none,
+    try_get,
     urlencode_postdata,
 )
 
 
 class ZattooBaseIE(InfoExtractor):
-
     _NETRC_MACHINE = 'zattoo'
     _HOST_URL = 'https://zattoo.com'
 
     _power_guide_hash = None
 
-    def _login(self, uuid, session_id):
+    def _login(self):
         (username, password) = self._get_login_info()
         if not username or not password:
-            raise ExtractorError(
-                'A valid %s account is needed to access this media.' % self._NETRC_MACHINE,
-                expected=True)
+            self.raise_login_required(
+                'A valid %s account is needed to access this media.'
+                % self._NETRC_MACHINE)
+
+        try:
+            data = self._download_json(
+                '%s/zapi/v2/account/login' % self._HOST_URL, None, 'Logging in',
+                data=urlencode_postdata({
+                    'login': username,
+                    'password': password,
+                    'remember': 'true',
+                }), headers={
+                    'Referer': '%s/login' % self._HOST_URL,
+                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                raise ExtractorError(
+                    'Unable to login: incorrect username and/or password',
+                    expected=True)
+            raise
+
+        self._power_guide_hash = data['session']['power_guide_hash']
+
+    def _real_initialize(self):
+        webpage = self._download_webpage(
+            self._HOST_URL, None, 'Downloading app token')
         app_token = self._html_search_regex(
-            r'