From 20dbcfc1cd8eb0d99fb8fbb04e4e49538a57d64d Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 6 Sep 2017 11:24:34 +0900 Subject: [PATCH 1/7] [compat] Add compat_zip --- youtube_dl/compat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e4e13bcf..2a62248ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -6,6 +6,7 @@ import collections import email import getpass import io +import itertools import optparse import os import re @@ -15,7 +16,6 @@ import socket import struct import subprocess import sys -import itertools import xml.etree.ElementTree @@ -2898,6 +2898,13 @@ else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip __all__ = [ 'compat_HTMLParseError', @@ -2948,5 +2955,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] From d5a11b49483d78601f2232648d9502c607f7905e Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 6 Sep 2017 11:25:28 +0900 Subject: [PATCH 2/7] [mixcloud] Add handling for new frontend --- youtube_dl/extractor/mixcloud.py | 150 +++++++++++++++++++------------ 1 file changed, 92 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f6360cce6..3c34d6be4 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,9 +9,9 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, - compat_str, compat_urllib_parse_unquote, compat_urlparse, + compat_zip ) from ..utils import ( clean_html, @@ -54,27 +54,12 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - _keys = [ - 'return { requestAnimationFrame: 
function(callback) { callback(); }, innerHeight: 500 };', - 'pleasedontdownloadourmusictheartistswontgetpaid', - 'window.addEventListener = window.addEventListener || function() {};', - '(function() { return new Date().toLocaleDateString(); })()' - ] - _current_key = None - - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - def _decrypt_play_info(self, play_info, video_id): - play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(self._keys, start=1): - try: - return self._parse_json( - ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) - for idx, ch in enumerate(play_info)]), - video_id) - except ExtractorError: - if num == len(self._keys): - raise + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -84,54 +69,103 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) - if not self._current_key: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url: - js = self._download_webpage(js_url, track_id, fatal=False) - if js: - KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): - key = self._search_regex( - KEY_RE_TEMPLATE % key_name, js, 'key', - default=None, group='key') - if key and isinstance(key, compat_str): - self._keys.insert(0, key) - self._current_key = key + # Legacy path + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + + if encrypted_play_info is not None: + # Decode + encrypted_play_info = 
base64.b64decode(encrypted_play_info) + else: + # New path + full_info_json = self._parse_json(self._html_search_regex( + r'', webpage, 'play info'), 'play info') + for item in full_info_json: + item_data = item.get("cloudcast", {}) \ + .get("data", {}) \ + .get("cloudcastLookup", {}) + if item_data \ + .get("streamInfo", {}) \ + .get("url", "") != "": + info_json = item_data + break message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info') + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url is None: + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id) + # Known plaintext attack + if encrypted_play_info: + kp = '{"stream_url":' + kpa_target = encrypted_play_info + else: + kp = 'https://' + kpa_target = base64.b64decode(info_json["streamInfo"]["url"]) + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, + "encryption key", default=None) + if key is not None: + break - play_info = self._decrypt_play_info(encrypted_play_info, track_id) + if encrypted_play_info is not None: + play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') + if message and 'stream_url' not in play_info: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + song_url = play_info['stream_url'] + formats = [{ + 'format_id': 'normal', + 'url': song_url + }] - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + title = 
self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') + thumbnail = self._proto_relative_url(self._html_search_regex( + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) + uploader = self._html_search_regex( + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) + uploader_id = self._search_regex( + r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) + description = self._og_search_description(webpage) + view_count = str_to_int(self._search_regex( + [r'([0-9,.]+)', + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], + webpage, 'play count', default=None)) - song_url = play_info['stream_url'] - - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'([0-9,.]+)', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + else: + title = info_json['name'] + thumbnail = 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + info_json['picture']['urlRoot'] + uploader = info_json['owner']['displayName'] + uploader_id = info_json['owner']['username'] + description = info_json['description'] + view_count = info_json['plays'] + formats = [ + { + 'format_id': 'normal', + 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['url'])) + }, + { + 'format_id': 'hls', + 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['hlsUrl'])) + }, + { + 'format_id': 'dash', + 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['dashUrl'])) + } + ] return { 'id': track_id, 'title': title, - 
'url': song_url, + 'formats': formats, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, From 7ab078eae9d34687e0a3db3e76e79cff6316a677 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 13 Sep 2017 22:23:57 +0900 Subject: [PATCH 3/7] Address review issues --- youtube_dl/extractor/mixcloud.py | 51 +++++++++++++++----------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 3c34d6be4..df4237174 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -18,7 +18,7 @@ from ..utils import ( ExtractorError, OnDemandPagedList, str_to_int, -) + try_get) class MixcloudIE(InfoExtractor): @@ -81,12 +81,8 @@ class MixcloudIE(InfoExtractor): full_info_json = self._parse_json(self._html_search_regex( r'', webpage, 'play info'), 'play info') for item in full_info_json: - item_data = item.get("cloudcast", {}) \ - .get("data", {}) \ - .get("cloudcastLookup", {}) - if item_data \ - .get("streamInfo", {}) \ - .get("url", "") != "": + item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + if try_get(item_data, lambda x: x['streamInfo']['url']) not in ['', None]: info_json = item_data break @@ -108,7 +104,7 @@ class MixcloudIE(InfoExtractor): kpa_target = encrypted_play_info else: kp = 'https://' - kpa_target = base64.b64decode(info_json["streamInfo"]["url"]) + kpa_target = base64.b64decode(info_json['streamInfo']['url']) partial_key = self._decrypt_xor_cipher(kpa_target, kp) for quote in ["'", '"']: key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, @@ -142,25 +138,26 @@ class MixcloudIE(InfoExtractor): else: title = info_json['name'] - thumbnail = 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + info_json['picture']['urlRoot'] - uploader = info_json['owner']['displayName'] - uploader_id = info_json['owner']['username'] - description = info_json['description'] - 
view_count = info_json['plays'] - formats = [ - { - 'format_id': 'normal', - 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['url'])) - }, - { - 'format_id': 'hls', - 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['hlsUrl'])) - }, - { - 'format_id': 'dash', - 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['dashUrl'])) - } - ] + thumbnail = try_get(info_json, + lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + uploader = try_get(info_json, lambda x: x['owner']['displayName']) + uploader_id = try_get(info_json, lambda x: x['owner']['username']) + description = try_get(info_json, lambda x: x['description']) + view_count = try_get(info_json, lambda x: x['plays']) + formats = [{ + 'format_id': 'normal', + 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['url'])) + }] + + hls_encrypted = try_get(info_json, lambda x: x['streamInfo']['hlsUrl']) + if hls_encrypted is not None: + hls_url = self._decrypt_xor_cipher(key, base64.b64decode(hls_encrypted)) + formats.extend(self._extract_m3u8_formats(hls_url, title)) + + dash_encrypted = try_get(info_json, lambda x: x['streamInfo']['dashUrl']) + if dash_encrypted is not None: + dash_url = self._decrypt_xor_cipher(key, base64.b64decode(dash_encrypted)) + formats.extend(self._extract_mpd_formats(dash_url, title)) return { 'id': track_id, From 6dd969ade49ac7e6f42993814c1ee2c7b04ab4ee Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Fri, 15 Sep 2017 22:05:27 +0900 Subject: [PATCH 4/7] Add fallback known-plaintext There seem to be some "special" URLs that are not served from their CDN. 
--- youtube_dl/extractor/mixcloud.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index df4237174..e31f1d977 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -100,17 +100,21 @@ class MixcloudIE(InfoExtractor): js = self._download_webpage(js_url, track_id) # Known plaintext attack if encrypted_play_info: - kp = '{"stream_url":' + kps = ['{"stream_url":'] kpa_target = encrypted_play_info else: - kp = 'https://' + kps = ['https://', 'http://'] kpa_target = base64.b64decode(info_json['streamInfo']['url']) - partial_key = self._decrypt_xor_cipher(kpa_target, kp) - for quote in ["'", '"']: - key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, - "encryption key", default=None) - if key is not None: - break + for kp in kps: + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, + "encryption key", default=None) + if key is not None: + break + else: + continue + break if encrypted_play_info is not None: play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') From b507277375718d85afb5a77f1da1d36f2aeca0a7 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Fri, 15 Sep 2017 22:08:48 +0900 Subject: [PATCH 5/7] Ensure data variables are assigned --- youtube_dl/extractor/mixcloud.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index e31f1d977..c0db41b53 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -85,6 +85,8 @@ class MixcloudIE(InfoExtractor): if try_get(item_data, lambda x: x['streamInfo']['url']) not in ['', None]: info_json = item_data break + else: + raise ExtractorError('Failed to extract matching stream info') 
message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', @@ -115,6 +117,8 @@ class MixcloudIE(InfoExtractor): else: continue break + else: + raise ExtractorError('Failed to extract encryption key') if encrypted_play_info is not None: play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') From a26f2947dd61f5acb3892c92b7cdfb9e3b0bfe8d Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 20 Sep 2017 16:46:25 +0900 Subject: [PATCH 6/7] Address review issue (try 2) --- youtube_dl/extractor/mixcloud.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index c0db41b53..691764ad2 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -82,7 +82,7 @@ class MixcloudIE(InfoExtractor): r'', webpage, 'play info'), 'play info') for item in full_info_json: item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) - if try_get(item_data, lambda x: x['streamInfo']['url']) not in ['', None]: + if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break else: @@ -152,17 +152,19 @@ class MixcloudIE(InfoExtractor): uploader_id = try_get(info_json, lambda x: x['owner']['username']) description = try_get(info_json, lambda x: x['description']) view_count = try_get(info_json, lambda x: x['plays']) + + stream_info = info_json['streamInfo'] formats = [{ 'format_id': 'normal', - 'url': self._decrypt_xor_cipher(key, base64.b64decode(info_json['streamInfo']['url'])) + 'url': self._decrypt_xor_cipher(key, base64.b64decode(stream_info['url'])) }] - hls_encrypted = try_get(info_json, lambda x: x['streamInfo']['hlsUrl']) + hls_encrypted = stream_info.get('hlsUrl') if hls_encrypted is not None: hls_url = self._decrypt_xor_cipher(key, base64.b64decode(hls_encrypted)) formats.extend(self._extract_m3u8_formats(hls_url, title)) 
- dash_encrypted = try_get(info_json, lambda x: x['streamInfo']['dashUrl']) + dash_encrypted = stream_info.get('dashUrl') if dash_encrypted is not None: dash_url = self._decrypt_xor_cipher(key, base64.b64decode(dash_encrypted)) formats.extend(self._extract_mpd_formats(dash_url, title)) From 79ec7fb1149ba314f439a0e5148905e96f051b23 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Thu, 21 Sep 2017 20:56:23 +0900 Subject: [PATCH 7/7] Reduce code duplication --- youtube_dl/extractor/mixcloud.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 691764ad2..481182380 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -61,6 +61,13 @@ class MixcloudIE(InfoExtractor): compat_chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) + @staticmethod + def _decrypt_and_extend(stream_info, url_key, getter, key, formats): + maybe_url = stream_info.get(url_key) + if maybe_url is not None: + decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) + formats.extend(getter(decrypted)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) @@ -154,20 +161,15 @@ class MixcloudIE(InfoExtractor): view_count = try_get(info_json, lambda x: x['plays']) stream_info = info_json['streamInfo'] - formats = [{ + formats = [] + self._decrypt_and_extend(stream_info, 'url', lambda x: [{ 'format_id': 'normal', - 'url': self._decrypt_xor_cipher(key, base64.b64decode(stream_info['url'])) - }] - - hls_encrypted = stream_info.get('hlsUrl') - if hls_encrypted is not None: - hls_url = self._decrypt_xor_cipher(key, base64.b64decode(hls_encrypted)) - formats.extend(self._extract_m3u8_formats(hls_url, title)) - - dash_encrypted = stream_info.get('dashUrl') - if dash_encrypted is not None: - dash_url = self._decrypt_xor_cipher(key, 
base64.b64decode(dash_encrypted)) - formats.extend(self._extract_mpd_formats(dash_url, title)) + 'url': x + }], key, formats) + self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, + formats) + self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, + formats) return { 'id': track_id,