From ebd92e71f25c6bcdf53b26af3e7ce8eaac1d98d0 Mon Sep 17 00:00:00 2001 From: Henrik Hank Date: Mon, 14 May 2018 22:27:23 +0200 Subject: [PATCH] Allow creation of internet shortcut files with new --write-link switch and similar ones. --- README.md | 28 ++++++++++++-- test/parameters.json | 5 +++ test/test_compat.py | 35 ++++++++++++++++-- test/test_utils.py | 27 ++++++++++++++ youtube_dl/YoutubeDL.py | 67 ++++++++++++++++++++++++++++++++- youtube_dl/__init__.py | 4 ++ youtube_dl/compat.py | 26 ++++++++++++- youtube_dl/options.py | 25 +++++++++++-- youtube_dl/utils.py | 82 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 284 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d9fe2350a..e4a13e6b8 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo age --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all - downloaded videos in it. + downloaded videos in it. When the switches + --write-link (or similar) and + --skip-download are used additionally, the + IDs will also be recorded, even though + nothing was actually downloaded. --include-ads Download advertisements as well (experimental) @@ -268,12 +272,24 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files -## Thumbnail images: +## Thumbnail Images: --write-thumbnail Write thumbnail image to disk --write-all-thumbnails Write all thumbnail image formats to disk --list-thumbnails Simulate and list all available thumbnail formats +## Internet Shortcut Options: + --write-link Write an internet shortcut file, depending + on the current platform (.url/.webloc/ + .desktop). The URL may be cached by the OS. + --write-url-link Write a Windows internet shortcut file + (.url). Note that the OS caches the URL + based on the file path. 
+ --write-webloc-link Write a macOS internet shortcut file + (.webloc) + --write-desktop-link Write a Linux internet shortcut file + (.desktop) + ## Verbosity / Simulation Options: -q, --quiet Activate quiet mode --no-warnings Ignore warnings @@ -385,7 +401,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --ap-list-mso List all supported multiple-system operators -## Post-processing Options: +## Post-Processing Options: -x, --extract-audio Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) @@ -497,7 +513,11 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [Python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operation. 
+ +You can, e.g., limit the number of characters of the title to prevent errors with too long filenames or file paths: `%(title).100s`. You may want to check the length of the rest of your filename or path and adapt the number accordingly. + +Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..2e186e2d5 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -35,6 +35,11 @@ "verbose": true, "writedescription": false, "writeinfojson": true, + "writeannotations": false, + "writelink": false, + "writeurllink": false, + "writewebloclink": false, + "writedesktoplink": false, "writesubtitles": false, "allsubtitles": false, "listssubtitles": false, diff --git a/test/test_compat.py b/test/test_compat.py index d6c54e135..63ae8e799 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -18,6 +18,8 @@ from youtube_dl.compat import ( compat_shlex_split, compat_str, compat_struct_unpack, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, @@ -52,6 +54,29 @@ class TestCompat(unittest.TestCase): dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_quote(self): + self.assertEqual(compat_urllib_parse_quote('abc def'), 'abc%20def') + self.assertEqual(compat_urllib_parse_quote('/~user/abc+def'), '/%7Euser/abc%2Bdef') + self.assertEqual(compat_urllib_parse_quote('/~user/abc+def', safe='/~+'), '/~user/abc+def') + self.assertEqual(compat_urllib_parse_quote(''), '') + self.assertEqual(compat_urllib_parse_quote('%'), '%25') + self.assertEqual(compat_urllib_parse_quote('%', safe='%'), '%') + self.assertEqual(compat_urllib_parse_quote('津波'), '%E6%B4%A5%E6%B3%A2') + self.assertEqual( + compat_urllib_parse_quote( + ''' + %%a''', safe='<>=":%/ \r\n'), + ''' + 
%%a''') + self.assertEqual( + compat_urllib_parse_quote( + '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%25Things%''', safe='% '), + '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%''') + + def test_compat_urllib_parse_quote_plus(self): + self.assertEqual(compat_urllib_parse_quote_plus('abc def'), 'abc+def') + self.assertEqual(compat_urllib_parse_quote_plus('~/abc def'), '%7E%2Fabc+def') + def test_compat_urllib_parse_unquote(self): self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') @@ -63,12 +88,14 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') self.assertEqual( - compat_urllib_parse_unquote(''' -%%a'''), + compat_urllib_parse_unquote( + ''' + %%a'''), ''' -%%a''') + %%a''') self.assertEqual( - compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), + compat_urllib_parse_unquote( + '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') def test_compat_urllib_parse_unquote_plus(self): diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..f881f12a4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -96,6 +96,7 @@ from youtube_dl.utils import ( cli_valueless_option, cli_bool_option, parse_codecs, + iri_to_uri, ) from youtube_dl.compat import ( compat_chr, @@ -1333,6 +1334,32 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) 
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + + def test_iri_to_uri(self): + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), + 'https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b') # Same + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=Käsesoßenrührlöffel'), # German for cheese sauce stirring spoon + 'https://www.google.com/search?q=K%C3%A4seso%C3%9Fenr%C3%BChrl%C3%B6ffel') + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=lt<+gt>+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#'), + 'https://www.google.com/search?q=lt%3C+gt%3E+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#') + self.assertEqual( + iri_to_uri('http://правозащита38.рф/category/news/'), + 'http://xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('http://www.правозащита38.рф/category/news/'), + 'http://www.xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('https://i❤.ws/emojidomain/👍👏🤝💪'), + 'https://xn--i-7iq.ws/emojidomain/%F0%9F%91%8D%F0%9F%91%8F%F0%9F%A4%9D%F0%9F%92%AA') + self.assertEqual( + iri_to_uri('http://日本語.jp/'), + 'http://xn--wgv71a119e.jp/') + self.assertEqual( + iri_to_uri('http://导航.中国/'), + 'http://xn--fet810g.xn--fiqs8s/') if __name__ == '__main__': diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 046e03247..20fd53b69 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( date_from_str, DateRange, DEFAULT_OUTTMPL, + DESKTOP_LINK_TEMPLATE, determine_ext, determine_protocol, DownloadError, @@ -61,6 +62,7 @@ from .utils import ( formatSeconds, GeoRestrictedError, int_or_none, + iri_to_uri, ISO3166Utils, locked_file, make_HTTPS_handler, @@ -83,9 +85,12 @@ from .utils import ( sanitized_Request, std_headers, subtitles_filename, + to_high_limit_path, UnavailableVideoError, 
url_basename, + URL_LINK_TEMPLATE, version_tuple, + WEBLOC_LINK_TEMPLATE, write_json_file, write_string, YoutubeDLCookieProcessor, @@ -178,6 +183,11 @@ class YoutubeDL(object): writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files + writelink: Write an internet shortcut file, depending on the + current platform (.url/.webloc/.desktop) + writeurllink: Write a Windows internet shortcut file (.url) + writewebloclink: Write a macOS internet shortcut file (.webloc) + writedesktoplink: Write a Linux internet shortcut file (.desktop) writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video @@ -204,7 +214,9 @@ class YoutubeDL(object): downloaded. None for no limit. download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded - again. + again. When 'writelink' (or similar) and + 'skip_download' are also present, the videos will be + recorded, too. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. @@ -1407,6 +1419,8 @@ class YoutubeDL(object): raise ExtractorError('Missing "id" field in extractor result') if 'title' not in info_dict: raise ExtractorError('Missing "title" field in extractor result') + if 'webpage_url' not in info_dict: + raise ExtractorError('Missing "webpage_url" field in extractor result. 
Should have been augmented with it.') def report_force_conversion(field, field_not, conversion): self.report_warning( @@ -1836,7 +1850,56 @@ class YoutubeDL(object): self._write_thumbnails(info_dict, filename) - if not self.params.get('skip_download', False): + # Write internet shortcut files + url_link = webloc_link = desktop_link = False + if self.params.get('writelink', False): + if sys.platform == "darwin": # macOS. + webloc_link = True + elif sys.platform.startswith("linux"): + desktop_link = True + else: # if sys.platform in ['win32', 'cygwin']: + url_link = True + if self.params.get('writeurllink', False): + url_link = True + if self.params.get('writewebloclink', False): + webloc_link = True + if self.params.get('writedesktoplink', False): + desktop_link = True + + if url_link or webloc_link or desktop_link: + ascii_url = iri_to_uri(info_dict['webpage_url']) + + def _write_link_file(extension, template, newline, embed_filename): + linkfn = replace_extension(filename, extension, info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(linkfn)): + self.to_screen('[info] Internet shortcut is already present') + else: + try: + self.to_screen('[info] Writing internet shortcut to: ' + linkfn) + with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile: + template_vars = { 'url': ascii_url } + if embed_filename: + template_vars['filename'] = linkfn[:-len(extension) - 1] + linkfile.write(template % template_vars) + except (OSError, IOError): + self.report_error('Cannot write internet shortcut ' + linkfn) + return False + return True + + if url_link: + if not _write_link_file('url', URL_LINK_TEMPLATE, '\r\n', embed_filename=False): return + if webloc_link: + if not _write_link_file('webloc', WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False): return + if desktop_link: + if not _write_link_file('desktop', DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True ): return + + if 
self.params.get('skip_download', False): + # Regarding the download archive, consider internet shortcut creation in conjunction with the `--skip-download` switch as everything the user wants. (See also help for the`--download-archive` switch.) + if url_link or webloc_link or desktop_link: + self.record_download_archive(info_dict) + + # Download + else: # No `--skip-download` try: def dl(name, info): fd = get_suitable_downloader(info, self.params)(self, self.params) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ba435ea42..b05494c39 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -369,6 +369,10 @@ def _real_main(argv=None): 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'write_all_thumbnails': opts.write_all_thumbnails, + 'writelink': opts.writelink, + 'writeurllink': opts.writeurllink, + 'writewebloclink': opts.writewebloclink, + 'writedesktoplink': opts.writedesktoplink, 'writesubtitles': opts.writesubtitles, 'writeautomaticsub': opts.writeautomaticsub, 'allsubtitles': opts.allsubtitles, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 4a611f183..1268e7379 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -37,15 +37,20 @@ try: except ImportError: # Python 2 import urllib as compat_urllib_parse +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + try: from urllib.parse import urlparse as compat_urllib_parse_urlparse except ImportError: # Python 2 from urlparse import urlparse as compat_urllib_parse_urlparse try: - import urllib.parse as compat_urlparse + from urllib.parse import urlunparse as compat_urllib_parse_urlunparse except ImportError: # Python 2 - import urlparse as compat_urlparse + from urlparse import urlunparse as compat_urllib_parse_urlunparse try: import urllib.response as compat_urllib_response @@ -2354,6 +2359,20 @@ try: except NameError: compat_str = str +try: + from urllib.parse import 
quote as compat_urllib_parse_quote + from urllib.parse import quote_plus as compat_urllib_parse_quote_plus +except ImportError: # Python 2 + def compat_urllib_parse_quote(string, safe='/'): + return compat_urllib_parse.quote( + string.encode('utf-8'), + str(safe)) + + def compat_urllib_parse_quote_plus(string, safe=''): + return compat_urllib_parse.quote_plus( + string.encode('utf-8'), + str(safe)) + try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -2992,11 +3011,14 @@ __all__ = [ 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', + 'compat_urllib_parse_quote', + 'compat_urllib_parse_quote_plus', 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlencode', 'compat_urllib_parse_urlparse', + 'compat_urllib_parse_urlunparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', diff --git a/youtube_dl/options.py b/youtube_dl/options.py index b692c6b3b..54d44af36 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -343,7 +343,7 @@ def parseOpts(overrideArguments=None): selection.add_option( '--download-archive', metavar='FILE', dest='download_archive', - help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') + help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. 
When the switches --write-link (or similar) and --skip-download are used additionally, the IDs will also be recorded, even though nothing was actually downloaded.') selection.add_option( '--include-ads', dest='include_ads', action='store_true', @@ -764,7 +764,7 @@ def parseOpts(overrideArguments=None): action='store_true', dest='rm_cachedir', help='Delete all filesystem cache files') - thumbnail = optparse.OptionGroup(parser, 'Thumbnail images') + thumbnail = optparse.OptionGroup(parser, 'Thumbnail Images') thumbnail.add_option( '--write-thumbnail', action='store_true', dest='writethumbnail', default=False, @@ -778,7 +778,25 @@ def parseOpts(overrideArguments=None): action='store_true', dest='list_thumbnails', default=False, help='Simulate and list all available thumbnail formats') - postproc = optparse.OptionGroup(parser, 'Post-processing Options') + link = optparse.OptionGroup(parser, 'Internet Shortcut Options') + link.add_option( + '--write-link', + action='store_true', dest='writelink', default=False, + help='Write an internet shortcut file, depending on the current platform (.url/.webloc/.desktop). The URL may be cached by the OS.') + link.add_option( + '--write-url-link', + action='store_true', dest='writeurllink', default=False, + help='Write a Windows internet shortcut file (.url). 
Note that the OS caches the URL based on the file path.') + link.add_option( + '--write-webloc-link', + action='store_true', dest='writewebloclink', default=False, + help='Write a macOS internet shortcut file (.webloc)') + link.add_option( + '--write-desktop-link', + action='store_true', dest='writedesktoplink', default=False, + help='Write a Linux internet shortcut file (.desktop)') + + postproc = optparse.OptionGroup(parser, 'Post-Processing Options') postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, @@ -866,6 +884,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(downloader) parser.add_option_group(filesystem) parser.add_option_group(thumbnail) + parser.add_option_group(link) parser.add_option_group(verbosity) parser.add_option_group(workarounds) parser.add_option_group(video_format) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f9ca63c58..f1aa620fc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -57,6 +57,9 @@ from .compat import ( compat_urllib_parse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_urlunparse, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, @@ -3902,3 +3905,82 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random.randint(1, 12)), day_field: str(random.randint(1, 31)), } + + +# Templates for internet shortcut files, which are plain text files. 
# Windows `.url` shortcut: an INI-style file. Expects the key `url`.
URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# macOS `.webloc` shortcut: an XML property list whose `URL` key holds the
# target. Expects the key `url`.
# NOTE: the URL is substituted verbatim, so it has to be XML-escaped by the
# caller if it can contain characters such as `&` or `<`.
WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# Freedesktop `.desktop` shortcut of type `Link`. Expects the keys `url` and
# `filename` (the latter is only used for the human-readable `Name` entry).
DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=Link to %(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()


def iri_to_uri(iri):
    """
    Convert an IRI (Internationalized Resource Identifier, allowing Unicode
    characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't
    escape `%3C` as `%253C`. Instead, it percent-escapes characters with an
    underlying UTF-8 encoding *besides* those already escaped, leaving the URI
    intact.

    Component handling:
      * user name / password: percent-encoded
      * host name: converted to Punycode via the 'idna' codec; skipped when
        the IRI has no host (e.g. a relative reference)
      * port: kept, except for the redundant port 80 on `http` IRIs
      * path, params, fragment: percent-encoded (a space becomes `%20`)
      * query: percent-encoded, with a space becoming `+`

    @param iri  The IRI as a text string.
    @return     The equivalent ASCII-only URI as a text string.
    @raise ValueError  If the network location contains an IPv6 literal.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the
    # characters that should not be percent-encoded. Everything else but
    # letters, digits and '_.-' will be percent-encoded with an underlying
    # UTF-8 encoding. Everything already percent-encoded will be left as is
    # (because '%' itself is always part of `safe`).
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    path_safe = r"!$%&'()*+,/:;=@|~"
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=userinfo_safe)
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=userinfo_safe)
        net_location += '@'

    # `hostname` is None for IRIs without a network location; there is
    # nothing to convert in that case.
    if iri_parts.hostname is not None:
        # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
        net_location += iri_parts.hostname.encode('idna').decode('utf-8')

    # Port 80 is only redundant for plain HTTP. For any other scheme it is a
    # non-default port and dropping it would change the target of the URI.
    port = iri_parts.port
    if port is not None and not (iri_parts.scheme == 'http' and port == 80):
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse((
        iri_parts.scheme,
        net_location,

        # Outside of the query, '+' is a literal plus sign, so spaces have to
        # become '%20' here (`quote`, not `quote_plus`).
        compat_urllib_parse_quote(iri_parts.path, safe=path_safe),

        # Unsure about the `safe` argument, since this is a legacy way of
        # handling parameters.
        compat_urllib_parse_quote(iri_parts.params, safe=path_safe),

        # Not totally sure about the `safe` argument, since the source does
        # not explicitly mention the query URI component.
        compat_urllib_parse_quote_plus(iri_parts.query, safe=query_safe),

        compat_urllib_parse_quote(iri_parts.fragment, safe=fragment_safe),
    ))


def _to_extended_length_path(abs_path):
    """Prefix an absolute Windows path so that it bypasses the MAX_PATH limit."""
    if abs_path.startswith('\\\\?\\') or abs_path.startswith('\\\\.\\'):
        # Already an extended-length or device path; prefixing again would
        # make it invalid.
        return abs_path
    if abs_path.startswith('\\\\'):
        # UNC path: `\\server\share\...` has to become `\\?\UNC\server\share\...`.
        return '\\\\?\\UNC\\' + abs_path[2:]
    return '\\\\?\\' + abs_path


def to_high_limit_path(path):
    """
    Return `path` in a form that is not subject to the MAX_PATH limitation.

    On native Windows the absolute path is returned with the `\\\\?\\` prefix
    (`\\\\?\\UNC\\` for UNC paths). The maximum allowed length for the
    individual path segments may still be quite limited. On every other
    platform `path` is returned unchanged; this includes Cygwin, whose
    `os.path.abspath()` yields POSIX-style paths that must not get a Win32
    prefix.
    """
    if sys.platform != 'win32':
        return path

    return _to_extended_length_path(os.path.abspath(path))