diff --git a/AUTHORS b/AUTHORS index d1693224e..901c1b263 100644 --- a/AUTHORS +++ b/AUTHORS @@ -140,3 +140,6 @@ Behrouz Abbasi ngld nyuszika7h Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42333c450..f8ab29631 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,7 +125,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. 
When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/README.md b/README.md index 25844eb6d..2ed751791 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ youtube-dl - download videos from youtube.com or other video platforms - [VIDEO SELECTION](#video-selection) - [FAQ](#faq) - [DEVELOPER INSTRUCTIONS](#developer-instructions) +- [EMBEDDING YOUTUBE-DL](#embedding-youtube-dl) - [BUGS](#bugs) - [COPYRIGHT](#copyright) @@ -34,7 +35,7 @@ You can also use pip: sudo pip install youtube-dl -Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . +Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . # DESCRIPTION **youtube-dl** is a small command-line program to download videos from @@ -207,7 +208,7 @@ which means you can modify it, redistribute it or use it however you like. -p, --password PASSWORD Account password. If this option is left out, youtube-dl will ask interactively. 
-2, --twofactor TWOFACTOR Two-factor auth code -n, --netrc Use .netrc authentication data - --video-password PASSWORD Video password (vimeo, smotri) + --video-password PASSWORD Video password (vimeo, smotri, youku) ## Post-processing Options: -x, --extract-audio Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) @@ -261,7 +262,7 @@ For example: machine youtube login myaccount@gmail.com password my_youtube_password machine twitch login my_twitch_account_name password my_twitch_password ``` -To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or to place it in [configuration file](#configuration). +To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in [configuration file](#configuration). On Windows you may also need to setup `%HOME%` environment variable manually. @@ -277,8 +278,8 @@ The `-o` option allows users to indicate a template for the output file names. T - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). - `epoch`: The sequence will be replaced by the Unix epoch when creating the file. - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - - `playlist`: The name or the id of the playlist that contains the video. - - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `playlist`: The sequence will be replaced by the name or the id of the playlist that contains the video. + - `playlist_index`: The sequence will be replaced by the index of the video in the playlist padded with leading zeros according to the total length of the playlist. - `format_id`: The sequence will be replaced by the format code specified by `--format`. The current default template is `%(title)s-%(id)s.%(ext)s`. @@ -552,7 +553,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. 
Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. 
When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8d9db53a6..66091e6be 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -166,7 +166,7 @@ - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Foxgay** - - **FoxNews** + - **FoxNews**: Fox News and Fox Business Video - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** @@ -195,7 +195,7 @@ - **GodTube** - **GoldenMoustache** - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net + - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com - **Goshgay** - **Groupon** - **Hark** @@ -220,6 +220,7 @@ - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** + - **ImgurAlbum** - **Ina** - **Indavideo** - **IndavideoEmbed** @@ -303,11 +304,11 @@ - **MPORA** - **MSNBC** - **MTV** + - **mtv.de** - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **MusicVault** - **muzu.tv** - **Mwave** - **MySpace** @@ -465,7 +466,7 @@ - **Sexu** - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - - **Shared** + - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - **Slideshare** @@ -631,6 +632,7 @@ - **vine:user** - **vk**: VK - **vk:uservideos**: VK - User's Videos + - **vlive** - **Vodlocker** - **VoiceRepublic** - **Vporn** diff --git a/test/test_compat.py b/test/test_compat.py index c3ba8ad2e..4ee0dc99d 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -14,6 +14,7 @@ from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_expanduser, + compat_shlex_split, compat_urllib_parse_unquote, 
compat_urllib_parse_unquote_plus, ) @@ -67,5 +68,8 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + def test_compat_shlex_split(self): + self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index a759b2da9..a5f164c49 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -57,11 +57,16 @@ from youtube_dl.utils import ( urlencode_postdata, version_tuple, xpath_with_ns, + xpath_element, xpath_text, + xpath_attr, render_table, match_str, parse_dfxp_time_expr, dfxp2srt, + cli_option, + cli_valueless_option, + cli_bool_option, ) @@ -264,6 +269,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(find('media:song/media:author').text, 'The Author') self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') + def test_xpath_element(self): + doc = xml.etree.ElementTree.Element('root') + div = xml.etree.ElementTree.SubElement(doc, 'div') + p = xml.etree.ElementTree.SubElement(div, 'p') + p.text = 'Foo' + self.assertEqual(xpath_element(doc, 'div/p'), p) + self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') + self.assertTrue(xpath_element(doc, 'div/bar') is None) + self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) + def test_xpath_text(self): testxml = '''
@@ -272,9 +287,25 @@ class TestUtil(unittest.TestCase): </root>''' doc = xml.etree.ElementTree.fromstring(testxml) self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') + self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertTrue(xpath_text(doc, 'div/bar') is None) self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) + def test_xpath_attr(self): + testxml = '''<root>
+ <div>
+ <p x="a">Foo</p>
+ </div>
+ </root>
''' + doc = xml.etree.ElementTree.fromstring(testxml) + self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') + self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) + self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) + self.assertEqual(xpath_attr(doc, 'div/bar', 'x', default='default'), 'default') + self.assertEqual(xpath_attr(doc, 'div/p', 'y', default='default'), 'default') + self.assertRaises(ExtractorError, xpath_attr, doc, 'div/bar', 'x', fatal=True) + self.assertRaises(ExtractorError, xpath_attr, doc, 'div/p', 'y', fatal=True) + def test_smuggle_url(self): data = {"ö": "ö", "abc": [3]} url = 'https://foo.bar/baz?x=y#a' @@ -646,6 +677,51 @@ The first line ''' self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) + def test_cli_option(self): + self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) + self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) + self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + + def test_cli_valueless_option(self): + self.assertEqual(cli_valueless_option( + {'downloader': 'external'}, '--external-downloader', 'downloader', 'external'), ['--external-downloader']) + self.assertEqual(cli_valueless_option( + {'downloader': 'internal'}, '--external-downloader', 'downloader', 'external'), []) + self.assertEqual(cli_valueless_option( + {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), ['--no-check-certificate']) + self.assertEqual(cli_valueless_option( + {'nocheckcertificate': False}, '--no-check-certificate', 'nocheckcertificate'), []) + self.assertEqual(cli_valueless_option( + {'checkcertificate': True}, '--no-check-certificate', 'checkcertificate', False), []) + self.assertEqual(cli_valueless_option( + {'checkcertificate': False}, '--no-check-certificate', 'checkcertificate', False), ['--no-check-certificate']) + + def test_cli_bool_option(self): + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': 
True}, '--no-check-certificate', 'nocheckcertificate'), + ['--no-check-certificate', 'true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='='), + ['--no-check-certificate=true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), + ['--check-certificate', 'false']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + ['--check-certificate=false']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), + ['--check-certificate', 'true']) + self.assertEqual( + cli_bool_option( + {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + ['--check-certificate=true']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py index 780636c77..84b8f39e0 100644 --- a/test/test_write_annotations.py +++ b/test/test_write_annotations.py @@ -33,7 +33,7 @@ params = get_params({ TEST_ID = 'gr51aVj-mLg' -ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml' +ANNOTATIONS_FILE = TEST_ID + '.annotations.xml' EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cad6b026e..d65253882 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -69,6 +69,7 @@ from .utils import ( version_tuple, write_json_file, write_string, + YoutubeDLCookieProcessor, YoutubeDLHandler, prepend_extension, replace_extension, @@ -284,7 +285,11 @@ class YoutubeDL(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self.params = params + self.params = { + # Default parameters + 
'nocheckcertificate': False, + } + self.params.update(params) self.cache = Cache(self) if params.get('bidi_workaround', False): @@ -1939,8 +1944,7 @@ class YoutubeDL(object): if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load() - cookie_processor = compat_urllib_request.HTTPCookieProcessor( - self.cookiejar) + cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': proxies = {} @@ -2009,7 +2013,7 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id)) try: uf = self.urlopen(t['url']) - with open(thumb_filename, 'wb') as thumbf: + with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 55b22c889..5e2ed4d4b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -9,7 +9,6 @@ import codecs import io import os import random -import shlex import sys @@ -20,6 +19,7 @@ from .compat import ( compat_expanduser, compat_getpass, compat_print, + compat_shlex_split, workaround_optparse_bug9161, ) from .utils import ( @@ -262,10 +262,10 @@ def _real_main(argv=None): parser.error('setting filesize xattr requested but python-xattr is not available') external_downloader_args = None if opts.external_downloader_args: - external_downloader_args = shlex.split(opts.external_downloader_args) + external_downloader_args = compat_shlex_split(opts.external_downloader_args) postprocessor_args = None if opts.postprocessor_args: - postprocessor_args = shlex.split(opts.postprocessor_args) + postprocessor_args = compat_shlex_split(opts.postprocessor_args) match_filter = ( None if opts.match_filter is None else match_filter_func(opts.match_filter)) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index ace5bd716..1ff42d94b 100644 --- a/youtube_dl/compat.py 
+++ b/youtube_dl/compat.py @@ -5,6 +5,7 @@ import getpass import optparse import os import re +import shlex import shutil import socket import subprocess @@ -79,6 +80,11 @@ try: except ImportError: import BaseHTTPServer as compat_http_server +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -99,7 +105,7 @@ except ImportError: # Python 2 # Is it a string-like object? string.split return b'' - if isinstance(string, unicode): + if isinstance(string, compat_str): string = string.encode('utf-8') bits = string.split(b'%') if len(bits) == 1: @@ -149,11 +155,6 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - try: compat_basestring = basestring # Python 2 except NameError: @@ -227,6 +228,17 @@ except ImportError: # Python < 3.3 return "'" + s.replace("'", "'\"'\"'") + "'" +if sys.version_info >= (2, 7, 3): + compat_shlex_split = shlex.split +else: + # Working around shlex issue with unicode strings on some python 2 + # versions (see http://bugs.python.org/issue1548891) + def compat_shlex_split(s, comments=False, posix=True): + if isinstance(s, compat_str): + s = s.encode('utf-8') + return shlex.split(s, comments, posix) + + def compat_ord(c): if type(c) is int: return c @@ -459,6 +471,7 @@ __all__ = [ 'compat_ord', 'compat_parse_qs', 'compat_print', + 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 6c310346c..2bc011266 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -5,6 +5,10 @@ import subprocess from .common import FileDownloader from ..utils import ( 
+ cli_option, + cli_valueless_option, + cli_bool_option, + cli_configuration_args, encodeFilename, encodeArgument, ) @@ -46,19 +50,16 @@ class ExternalFD(FileDownloader): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') def _option(self, command_option, param): - param = self.params.get(param) - if param is None: - return [] - if isinstance(param, bool): - return [command_option] - return [command_option, param] + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) def _configuration_args(self, default=[]): - ex_args = self.params.get('external_downloader_args') - if ex_args is None: - return default - assert isinstance(ex_args, list) - return ex_args + return cli_configuration_args(self.params, 'external_downloader_args', default) def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ @@ -80,6 +81,8 @@ class CurlFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -102,7 +105,7 @@ class WgetFD(ExternalFD): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--bind-address', 'source_address') cmd += self._option('--proxy', 'proxy') - cmd += self._option('--no-check-certificate', 'nocheckcertificate') + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', 
info_dict['url']] return cmd @@ -121,6 +124,7 @@ class Aria2cFD(ExternalFD): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += ['--', info_dict['url']] return cmd diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 275564b59..174180db5 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -13,6 +13,8 @@ from ..compat import ( compat_urllib_error, ) from ..utils import ( + encodeFilename, + sanitize_open, struct_pack, struct_unpack, xpath_text, @@ -343,18 +345,19 @@ class F4mFD(FragmentFD): success = ctx['dl'].download(frag_filename, {'url': url}) if not success: return False - with open(frag_filename, 'rb') as down: - down_data = down.read() - reader = FlvReader(down_data) - while True: - _, box_type, box_data = reader.read_box_info() - if box_type == b'mdat': - dest_stream.write(box_data) - break + (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') + down_data = down.read() + down.close() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break if live: - os.remove(frag_filename) + os.remove(encodeFilename(frag_sanitized)) else: - frags_filenames.append(frag_filename) + frags_filenames.append(frag_sanitized) except (compat_urllib_error.HTTPError, ) as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. 
Continue @@ -375,6 +378,6 @@ class F4mFD(FragmentFD): self._finish_frag_download(ctx) for frag_file in frags_filenames: - os.remove(frag_file) + os.remove(encodeFilename(frag_file)) return True diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2b6c3370f..7743e176a 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -12,6 +12,7 @@ from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( encodeArgument, encodeFilename, + sanitize_open, ) @@ -30,7 +31,7 @@ class HlsFD(FileDownloader): args = [ encodeArgument(opt) for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] - args.append(encodeFilename(tmpfilename, True)) + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) @@ -89,13 +90,14 @@ class NativeHlsFD(FragmentFD): success = ctx['dl'].download(frag_filename, {'url': frag_url}) if not success: return False - with open(frag_filename, 'rb') as down: - ctx['dest_stream'].write(down.read()) - frags_filenames.append(frag_filename) + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + frags_filenames.append(frag_sanitized) self._finish_frag_download(ctx) for frag_file in frags_filenames: - os.remove(frag_file) + os.remove(encodeFilename(frag_file)) return True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 88757a382..f7305751c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -138,7 +138,6 @@ from .dump import DumpIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE -from .divxstage import DivxStageIE from .dropbox import DropboxIE from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE @@ -241,7 +240,10 @@ from .imdb import ( ImdbIE, ImdbListIE ) -from .imgur import ImgurIE +from .imgur 
import ( + ImgurIE, + ImgurAlbumIE, +) from .ina import InaIE from .indavideo import ( IndavideoIE, @@ -340,10 +342,10 @@ from .mtv import ( MTVIE, MTVServicesEmbeddedIE, MTVIggyIE, + MTVDEIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .musicvault import MusicVaultIE from .muzu import MuzuTVIE from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE @@ -364,6 +366,9 @@ from .nbc import ( from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, ) from .ndtv import NDTVIE from .netzkino import NetzkinoIE @@ -399,7 +404,11 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE from .novamov import NovaMovIE -from .nowness import NownessIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( @@ -429,7 +438,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, @@ -740,6 +748,7 @@ from .vk import ( VKIE, VKUserVideosIE, ) +from .vlive import VLiveIE from .vodlocker import VodlockerIE from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE @@ -800,7 +809,7 @@ from .youtube import ( YoutubeIE, YoutubeChannelIE, YoutubeFavouritesIE, - YoutubeHistoryIE, + #YoutubeHistoryIE, YoutubePlaylistIE, YoutubeRecommendedIE, YoutubeSearchDateIE, diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index dc0fb85d6..f9a389f67 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,16 +1,20 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor +from ..utils import ( + ExtractorError, + js_to_json, + int_or_none, +) class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 
'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', 'md5': 'cb3dd03b18455a661071ee1e28344d9f', 'info_dict': { @@ -19,22 +23,47 @@ class ABCIE(InfoExtractor): 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone', 'description': 'md5:809ad29c67a05f54eb41f2a105693a67', }, - } + }, { + 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', + 'md5': 'db2a5369238b51f9811ad815b69dc086', + 'info_dict': { + 'id': 'NvqvPeNZsHU', + 'ext': 'mp4', + 'upload_date': '20150816', + 'uploader': 'ABC News (Australia)', + 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef', + 'uploader_id': 'NewsOnABC', + 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', + }, + 'add_ie': ['Youtube'], + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - urls_info_json = self._search_regex( - r'inlineVideoData\.push\((.*?)\);', webpage, 'video urls', - flags=re.DOTALL) - urls_info = json.loads(urls_info_json.replace('\'', '"')) + mobj = re.search( + r'inline(?P<type>Video|YouTube)Data\.push\((?P<json_data>[^)]+)\);', + webpage) + if mobj is None: + raise ExtractorError('Unable to extract video urls') + + urls_info = self._parse_json( + mobj.group('json_data'), video_id, transform_source=js_to_json) + + if not isinstance(urls_info, list): + urls_info = [urls_info] + + if mobj.group('type') == 'YouTube': + return self.playlist_result([ + self.url_result(url_info['url']) for url_info in urls_info]) + formats = [{ 'url': url_info['url'], - 'width': int(url_info['width']), - 'height': int(url_info['height']), - 'tbr': int(url_info['bitrate']), - 'filesize': int(url_info['filesize']), + 'width': 
int_or_none(url_info.get('width')), + 'height': int_or_none(url_info.get('height')), + 'tbr': int_or_none(url_info.get('bitrate')), + 'filesize': int_or_none(url_info.get('filesize')), } for url_info in urls_info] self._sort_formats(formats) diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 47313fba8..34095501c 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -15,7 +15,7 @@ class AcademicEarthCourseIE(InfoExtractor): 'title': 'Laws of Nature', 'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.', }, - 'playlist_count': 4, + 'playlist_count': 3, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 39335b827..4327c2f61 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -156,7 +156,7 @@ class AdultSwimIE(InfoExtractor): xpath_text(idoc, './/trt', 'segment duration').strip()) formats = [] - file_els = idoc.findall('.//files/file') + file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') for file_el in file_els: bitrate = file_el.attrib.get('bitrate') diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py index 611ad1e9d..f8e70f4e5 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/youtube_dl/extractor/airmozilla.py @@ -20,14 +20,14 @@ class AirMozillaIE(InfoExtractor): 'id': '6x4q2w', 'ext': 'mp4', 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', - 'thumbnail': 're:https://\w+\.cloudfront\.net/6x4q2w/poster\.jpg\?t=\d+', + 'thumbnail': 're:https?://vid\.ly/(?P[0-9a-z-]+)/poster', 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', 'timestamp': 1422487800, 'upload_date': '20150128', 
'location': 'SFO Commons', 'duration': 3780, 'view_count': int, - 'categories': ['Main'], + 'categories': ['Main', 'Privacy'], } } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 612708e25..184a14a4f 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -16,6 +16,7 @@ class AlJazeeraIE(InfoExtractor): 'uploader': 'Al Jazeera English', }, 'add_ie': ['Brightcove'], + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 76de24477..2a00da3ee 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( find_xpath_attr, unified_strdate, @@ -77,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex( [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url') + webpage, 'json vp url', default=None) + if not json_url: + iframe_url = self._html_search_regex( + r']+src=(["\'])(?P.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url') + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] return self._extract_from_json_url(json_url, video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index abc5a44a1..42526357a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,6 +21,7 @@ class BBCCoUkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' _MEDIASELECTOR_URLS = [ + 
'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] @@ -189,6 +190,12 @@ class BBCCoUkIE(InfoExtractor): # Skip DASH until supported elif transfer_format == 'dash': pass + elif transfer_format == 'hls': + m3u8_formats = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=supplier, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) # Direct link else: formats.append({ diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index dda583680..e857e66f4 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -17,55 +17,81 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P[^?#]+)' - - _TESTS = [ - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '214411058091220', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace', - 'description': 'Věda a současná civilizace. 
Interaktivní pořad - prostor pro vaše otázky a komentáře', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494876951776', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, }, - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', - 'info_dict': { - 'id': '14716', - 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'params': { + # m3u8 download + 'skip_download': True, }, - ] + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '61924494876844374', + 'ext': 'mp4', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 
1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] def _real_extract(self, url): url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, playlist_id) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s

' % NOT_AVAILABLE_STRING in webpage: raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') - episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + typ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') data = { 'playlist[0][type]': typ, @@ -83,7 +109,7 @@ class CeskaTelevizeIE(InfoExtractor): req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', url) - playlistpage = self._download_json(req, video_id) + playlistpage = self._download_json(req, playlist_id) playlist_url = playlistpage['url'] if playlist_url == 'error_region': @@ -92,33 +118,43 @@ class CeskaTelevizeIE(InfoExtractor): req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_json(req, video_id) + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) - item = playlist['playlist'][0] - formats = [] - for format_id, stream_url in item['streamUrls'].items(): - formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) - self._sort_formats(formats) + playlist = self._download_json(req, playlist_id)['playlist'] + playlist_len = len(playlist) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') + entries = [] + for item in playlist: + formats = [] + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + self._sort_formats(formats) - subtitles = {} - subs = 
item.get('subtitles') - if subs: - subtitles = self.extract_subtitles(episode_id, subs) + item_id = item.get('id') or item['assetId'] + title = item['title'] - return { - 'id': episode_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + entries.append({ + 'id': item_id, + 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) def _get_subtitles(self, episode_id, subs): original_subtitles = self._download_webpage( diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index c949a4814..fd1770dac 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ExtractorError from .bliptv import BlipTVIE +from .screenwavemedia import ScreenwaveMediaIE class CinemassacreIE(InfoExtractor): @@ -83,10 +84,10 @@ class CinemassacreIE(InfoExtractor): playerdata_url = self._search_regex( [ - r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', - r']+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', + ScreenwaveMediaIE.EMBED_PATTERN, + r']+src="(?P(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], - webpage, 'player data URL', default=None) + webpage, 'player data URL', default=None, group='url') if not playerdata_url: playerdata_url = 
BlipTVIE._extract_url(webpage) if not playerdata_url: diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 14f215c5c..1dfa7c12e 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,9 +12,9 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P[0-9]+)\.html' + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', 'md5': '1592b694ba586036efac1776b0b43cd3', 'info_dict': { @@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor): 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', } - } + }, { + 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d24bcb6a..d694e818e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -510,6 +510,12 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' 
% msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -725,9 +731,10 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'', '', html) hidden_inputs = {} - for input in re.findall(r']+)>', html): - if not re.search(r'type=(["\'])hidden\1', input): + for input in re.findall(r'(?i)]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P.+?)\1', input) if not name: @@ -740,7 +747,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 33a033a7f..95952bc29 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -20,16 +20,34 @@ from ..utils import ( ExtractorError, bytes_to_intlist, intlist_to_bytes, + int_or_none, remove_end, unified_strdate, urlencode_postdata, + xpath_text, ) from ..aes import ( aes_cbc_decrypt, ) -class CrunchyrollIE(InfoExtractor): +class CrunchyrollBaseIE(InfoExtractor): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + else compat_urllib_request.Request(url_or_request)) + # Accept-Language must be set explicitly to accept any language to avoid issues + # similar to https://github.com/rg3/youtube-dl/issues/6797. + # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction + # should be imposed or not (from what I can see it just takes the first language + # ignoring the priority and requires it to correspond the IP). By the way this causes + # Crunchyroll to not work in georestriction cases in some browsers that don't place + # the locale lang first in header. However allowing any language seems to workaround the issue. + request.add_header('Accept-Language', '*') + return super(CrunchyrollBaseIE, self)._download_webpage( + request, video_id, note, errnote, fatal, tries, timeout, encoding) + + +class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ @@ -237,7 +255,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage_url = 'http://www.' 
+ mobj.group('url') webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') - note_m = self._html_search_regex(r'
(.+?)
', webpage, 'trailer-notice', default='') + note_m = self._html_search_regex( + r'
(.+?)
', + webpage, 'trailer-notice', default='') if note_m: raise ExtractorError(note_m) @@ -247,15 +267,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if msg.get('type') == 'error': raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) + if 'To view this, please log in to verify you are 18 or older.' in webpage: + self.raise_login_required() + video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: video_description = None - video_upload_date = self._html_search_regex(r'
Availability for free users:(.+?)
', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) + video_upload_date = self._html_search_regex( + [r'
Availability for free users:(.+?)
', r'
[^<>]+\s*(.+?\d{4})\s*
'], + webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) if video_upload_date: video_upload_date = unified_strdate(video_upload_date) - video_uploader = self._html_search_regex(r'
\s*Publisher:(.+?)
', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) + video_uploader = self._html_search_regex( + r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, + 'video_uploader', fatal=False) playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) @@ -281,6 +308,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text stream_info = streamdata.find('./{default}preload/stream_info') video_url = stream_info.find('./host').text video_play_path = stream_info.find('./file').text + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'format_id': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } if '.fplive.net/' in video_url: video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) @@ -289,19 +323,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text netloc='v.lvlt.crcdn.net', path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): - formats.append({ + format_info.update({ 'url': direct_video_url, - 'format_id': video_format, }) + formats.append(format_info) continue - formats.append({ + format_info.update({ 'url': video_url, 'play_path': video_play_path, 'ext': 'flv', - 'format': video_format, - 'format_id': video_format, }) + formats.append(format_info) subtitles = self.extract_subtitles(video_id, webpage) @@ -317,7 +350,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text } -class CrunchyrollShowPlaylistIE(InfoExtractor): +class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" _VALID_URL = 
r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P[\w\-]+))/?$' diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 82261e25c..6f2fea5ff 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -44,8 +44,8 @@ class DCNIE(InfoExtractor): title = video.get('title_en') or video['title_ar'] webpage = self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' - + compat_urllib_parse.urlencode({ + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + + compat_urllib_parse.urlencode({ 'id': video['id'], 'user_id': video['user_id'], 'signature': video['signature'], diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py deleted file mode 100644 index b88379e06..000000000 --- a/youtube_dl/extractor/divxstage.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class DivxStageIE(NovaMovIE): - IE_NAME = 'divxstage' - IE_DESC = 'DivxStage' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'} - - _HOST = 'www.divxstage.eu' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'
\s*([^<]+)' - _DESCRIPTION_REGEX = r'
\s*[^<]+\s*

([^<]+)

' - - _TEST = { - 'url': 'http://www.divxstage.eu/video/57f238e2e5e01', - 'md5': '63969f6eb26533a1968c4d325be63e72', - 'info_dict': { - 'id': '57f238e2e5e01', - 'ext': 'flv', - 'title': 'youtubedl test video', - 'description': 'This is a test video for youtubedl.', - } - } diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index 999fb5620..1f00386fe 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -9,8 +9,8 @@ from ..utils import qualities class DumpertIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P[0-9]+/[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P[0-9]+/[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', 'info_dict': { @@ -20,11 +20,15 @@ class DumpertIE(InfoExtractor): 'description': 'Niet schrikken hoor', 'thumbnail': 're:^https?://.*\.jpg$', } - } + }, { + 'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + url = 'https://www.dumpert.nl/mediabase/' + video_id req = compat_urllib_request.Request(url) req.add_header('Cookie', 'nsfw=1; cpc=10') webpage = self._download_webpage(req, video_id) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 688dfc2f7..a1ee51568 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -79,7 +79,7 @@ class EaglePlatformIE(InfoExtractor): age_limit = 0 if age_restriction == 'allow_all' else 18 m3u8_data = self._download_json( - media['sources']['secure_m3u8']['auto'], + self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:'), video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 
316033cf1..7fcd0151d 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -71,8 +71,7 @@ class EroProfileIE(InfoExtractor): m = re.search(r'You must be logged in to view this video\.', webpage) if m: - raise ExtractorError( - 'This video requires login. Please specify a username and password and try again.', expected=True) + self.raise_login_required('This video requires login') video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index e4f7195a8..a406945e8 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -10,12 +10,13 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, ) class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)?content/(?P[^/]+)' + _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -37,6 +38,9 @@ class FC2IE(InfoExtractor): 'password': '(snip)', 'skip': 'requires actual password' } + }, { + 'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF', + 'only_matching': True, }] def _login(self): @@ -52,10 +56,7 @@ class FC2IE(InfoExtractor): 'Submit': ' Login ', } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') request = compat_urllib_request.Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) @@ -80,7 +81,7 @@ class FC2IE(InfoExtractor): title = self._og_search_title(webpage) thumbnail = self._og_search_thumbnail(webpage) - refer = url.replace('/content/', '/a/content/') + refer = 
url.replace('/content/', '/a/content/') if '/a/content/' not in url else url mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 917f76b1e..3a4a59135 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -8,7 +10,8 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): - _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + IE_DESC = 'Fox News and Fox Business Video' + _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -42,13 +45,19 @@ class FoxNewsIE(InfoExtractor): 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', 'only_matching': True, }, + { + 'url': 'http://video.foxbusiness.com/v/4442309889001', + 'only_matching': True, + }, ] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') video = self._download_json( - 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id) + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) item = video['channel']['item'] title = item['title'] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 75723c00d..129984a5f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -78,9 +78,14 @@ class FranceTVBaseInfoExtractor(InfoExtractor): }) self._sort_formats(formats) + title = info['titre'] + subtitle = info.get('sous_titre') + if subtitle: + title += ' - %s' % subtitle + return { 'id': video_id, - 'title': 
info['titre'], + 'title': title, 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), @@ -214,15 +219,15 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, # france5 { - 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968', - 'md5': '78f0f4064f9074438e660785bbf2c5d9', + 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', + 'md5': 'f6c577df3806e26471b3d21631241fd0', 'info_dict': { - 'id': '108961659', + 'id': '123327454', 'ext': 'flv', - 'title': 'C à dire ?!', - 'description': 'md5:1a4aeab476eb657bf57c4ff122129f81', - 'upload_date': '20140915', - 'timestamp': 1410795000, + 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', + 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', + 'upload_date': '20150831', + 'timestamp': 1441035120, }, }, # franceo diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 376feecae..8881a8a23 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import os import re +import sys from .common import InfoExtractor from .youtube import YoutubeIE @@ -48,6 +49,7 @@ from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE +from .screenwavemedia import ScreenwaveMediaIE class GenericIE(InfoExtractor): @@ -229,6 +231,22 @@ class GenericIE(InfoExtractor): 'skip_download': False, } }, + { + # redirect in Refresh HTTP header + 'url': 
'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -1001,6 +1019,16 @@ class GenericIE(InfoExtractor): 'description': 'New experience with Acrobat DC', 'duration': 248.667, }, + }, + # ScreenwaveMedia embed + { + 'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1', + 'md5': '24ace5baba0d35d55c6810b51f34e9e0', + 'info_dict': { + 'id': 'cinemasnob-55d26273809dd', + 'ext': 'mp4', + 'title': 'cinemasnob', + }, } ] @@ -1718,6 +1746,11 @@ class GenericIE(InfoExtractor): if snagfilms_url: return self.url_result(snagfilms_url) + # Look for ScreenwaveMedia embeds + mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) + if mobj is not None: + return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') + # Look for AdobeTVVideo embeds mobj = re.search( r']+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', @@ -1781,7 +1814,7 @@ class GenericIE(InfoExtractor): found = filter_video(re.findall(r'.*?]*)?\s+src=["\'](.*?)["\']', webpage) + found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?]*)?\s+src=["\'](.*?)["\']', webpage) if not found: REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( @@ -1792,6 +1825,9 @@ class GenericIE(InfoExtractor): # Look also in Refresh HTTP 
header refresh_header = head_response.headers.get('Refresh') if refresh_header: + # In python 2 response HTTP headers are bytestrings + if sys.version_info < (3, 0) and isinstance(refresh_header, str): + refresh_header = refresh_header.decode('iso-8859-1') found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8a95793ca..33d6432a6 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -13,6 +13,7 @@ from ..compat import ( from ..utils import ( ExtractorError, float_or_none, + int_or_none, ) @@ -359,13 +360,8 @@ class GloboIE(InfoExtractor): self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] title = video['title'] - duration = float_or_none(video['duration'], 1000) - like_count = video['likes'] - uploader = video['channel'] - uploader_id = video['channel_id'] formats = [] - for resource in video['resources']: resource_id = resource.get('_id') if not resource_id: @@ -407,6 +403,11 @@ class GloboIE(InfoExtractor): self._sort_formats(formats) + duration = float_or_none(video.get('duration'), 1000) + like_count = int_or_none(video.get('likes')) + uploader = video.get('channel') + uploader_id = video.get('channel_id') + return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index f006f0cb1..d23e3eac1 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -10,15 +10,16 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + encode_dict, int_or_none, ) class GorillaVidIE(InfoExtractor): - IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net' + IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com' _VALID_URL = r'''(?x) https?://(?P(?:www\.)? 
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net))/ + (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com))/ (?:embed-)?(?P[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? ''' @@ -67,13 +68,22 @@ class GorillaVidIE(InfoExtractor): }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, + }, { + 'url': 'http://filehoot.com/3ivfabn7573c.html', + 'info_dict': { + 'id': '3ivfabn7573c', + 'ext': 'mp4', + 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', + 'thumbnail': 're:http://.*\.jpg', + } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) + url = 'http://%s/%s' % (mobj.group('host'), video_id) + webpage = self._download_webpage(url, video_id) if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) @@ -87,7 +97,7 @@ class GorillaVidIE(InfoExtractor): if countdown: self._sleep(countdown, video_id) - post = compat_urllib_parse.urlencode(fields) + post = compat_urllib_parse.urlencode(encode_dict(fields)) req = compat_urllib_request.Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -95,7 +105,7 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage(req, video_id, 'Downloading video page') title = self._search_regex( - [r'style="z-index: [0-9]+;">([^<]+)', r'>Watch (.+) '], + [r'style="z-index: [0-9]+;">([^<]+)', r'([^<]+)', r'>Watch (.+) '], webpage, 'title', default=None) or self._og_search_title(webpage) video_url = self._search_regex( r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url') diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index d692ea79a..70c8ca64e 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class 
ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -97,3 +97,28 @@ class ImgurIE(InfoExtractor): 'description': self._og_search_description(webpage), 'title': self._og_search_title(webpage), } + + +class ImgurAlbumIE(InfoExtractor): + _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P[a-zA-Z0-9]+)' + + _TEST = { + 'url': 'http://imgur.com/gallery/Q95ko', + 'info_dict': { + 'id': 'Q95ko', + }, + 'playlist_count': 25, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + album_images = self._download_json( + 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, + album_id)['data']['images'] + + entries = [ + self.url_result('http://imgur.com/%s' % image['hash']) + for image in album_images if image.get('hash')] + + return self.playlist_result(entries, album_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index d28730492..3dca0e566 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -13,12 +13,24 @@ from ..utils import ( class KalturaIE(InfoExtractor): _VALID_URL = r'''(?x) - (?:kaltura:| - https?://(:?(?:www|cdnapisec)\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_ - )(?P\d+) - (?::| - /(?:[^/]+/)*?entry_id/ - )(?P[0-9a-z_]+)''' + (?: + kaltura:(?P\d+):(?P[0-9a-z_]+)| + https?:// + (:?(?:www|cdnapisec)\.)?kaltura\.com/ + (?: + (?: + # flash player + index\.php/kwidget/ + (?:[^/]+/)*?wid/_(?P\d+)/ + (?:[^/]+/)*?entry_id/(?P[0-9a-z_]+)| + # html5 player + html5/html5lib/ + (?:[^/]+/)*?entry_id/(?P[0-9a-z_]+) + .*\?.*\bwid=_(?P\d+) + ) + ) + ) + ''' _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' 
_TESTS = [ { @@ -43,6 +55,10 @@ class KalturaIE(InfoExtractor): 'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3', 'only_matching': True, }, + { + 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', + 'only_matching': True, + } ] def _kaltura_api_call(self, video_id, actions, *args, **kwargs): @@ -105,9 +121,9 @@ class KalturaIE(InfoExtractor): video_id, actions, note='Downloading video info JSON') def _real_extract(self, url): - video_id = self._match_id(url) mobj = re.match(self._VALID_URL, url) - partner_id, entry_id = mobj.group('partner_id'), mobj.group('id') + partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') + entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') info, source_data = self._get_video_info(entry_id, partner_id) @@ -126,7 +142,7 @@ class KalturaIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, + 'id': entry_id, 'title': info['name'], 'formats': formats, 'description': info.get('description'), diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index 96f95979a..0ae8ebd68 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor): 'duration': 27, 'thumbnail': 're:^https?://.*\.jpg', }, + 'params': { + 'skip_download': 'Not accessible from Travis CI server', + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 1077846f2..fa233377d 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -202,6 +202,7 @@ class KuwoSingerIE(InfoExtractor): 'title': 'Ali', }, 'playlist_mincount': 95, + 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] def _real_extract(self, url): diff --git 
a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5b9157ed4..378117270 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -118,9 +118,7 @@ class LyndaIE(LyndaBaseIE): 'lynda returned error: %s' % video_json['Message'], expected=True) if video_json['HasAccess'] is False: - raise ExtractorError( - 'Video %s is only available for members. ' - % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True) + self.raise_login_required('Video %s is only available for members' % video_id) video_id = compat_str(video_json['ID']) duration = video_json['DurationInSeconds'] diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 54a14cb94..ab1300185 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'sonypicturesrus@mail.ru', 'duration': 184, }, + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', @@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, + 'skip': 'Not accessible from Travis CI server', }, ] diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index d7ab6a9ae..f088ab9e2 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -18,12 +18,12 @@ class TechTVMITIE(InfoExtractor): _TEST = { 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', - 'md5': '1f8cb3e170d41fd74add04d3c9330e5f', + 'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7', 'info_dict': { 'id': '25418', 'ext': 'mp4', - 'title': 'MIT DNA Learning Center Set', - 'description': 'md5:82313335e8a8a3f243351ba55bc1b474', + 'title': 'MIT DNA and Protein Sets', + 'description': 'md5:46f5c69ce434f0a97e7c628cc142802d', }, } @@ -33,8 +33,8 @@ class TechTVMITIE(InfoExtractor): 'http://techtv.mit.edu/videos/%s' % video_id, video_id) clean_page = re.compile(r'', 
re.S).sub('', raw_page) - base_url = self._search_regex( - r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url') + base_url = self._proto_relative_url(self._search_regex( + r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:') formats_json = self._search_regex( r'bitrates: (\[.+?\])', raw_page, 'video formats') formats_mit = json.loads(formats_json) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 852d72266..54993e2c9 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,74 +1,85 @@ from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_parse_unquote, - compat_urlparse, -) +from ..compat import compat_urllib_parse from ..utils import ( + encode_dict, get_element_by_attribute, - parse_duration, - strip_jsonp, + int_or_none, ) class MiTeleIE(InfoExtractor): - IE_NAME = 'mitele.es' + IE_DESC = 'mitele.es' _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', + 'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a', 'info_dict': { - 'id': '0fce117d', - 'ext': 'mp4', - 'title': 'Programa 144 - Tor, la web invisible', - 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', + 'ext': 'flv', + 'title': 'Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) + display_id = self._match_id(url) - domain = 
embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') + webpage = self._download_webpage(url, display_id) - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' + token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4') + config_url = self._search_regex( + r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') + + config = self._download_json( + config_url, display_id, 'Downloading config JSON') + + mmc = self._download_json( + config['services']['mmc'], display_id, 'Downloading mmc JSON') + + formats = [] + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + bas = location.get('bas') + loc = location.get('loc') + ogn = location.get('ogn') + if None in (gat, bas, loc, ogn): + continue + token_data = { + 'bas': bas, + 'icd': loc, + 'ogn': ogn, + 'sta': '0', + } + media = self._download_json( + '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), + display_id, 'Downloading %s JSON' % location['loc']) + file_ = media.get('file') + if not file_: + continue + formats.extend(self._extract_f4m_formats( + file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + display_id, f4m_id=loc)) + + title = self._search_regex( + r'class="Destacado-text"[^>]*>\s*([^<]+)', webpage, 'title') + + video_id = self._search_regex( + r'data-media-id\s*=\s*"([^"]+)"', webpage, + 'data media id', default=None) or display_id + thumbnail = config.get('poster', {}).get('imageUrl') + duration = int_or_none(mmc.get('duration')) return 
{ - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, - 'formats': formats, + 'id': video_id, + 'display_id': display_id, + 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index b48fac5e3..a597714e9 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -67,7 +67,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return [{'url': url, 'ext': 'mp4'}] def _extract_video_formats(self, mdoc, mtvn_id): - if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None: + if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None: if mtvn_id is not None and self._MOBILE_TEMPLATE is not None: self.to_screen('The normal version is not available from your ' 'country, trying with the mobile version') @@ -114,7 +114,8 @@ class MTVServicesInfoExtractor(InfoExtractor): # Remove the templates, like &device={device} mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: - mediagen_url += '&acceptMethods=fms' + mediagen_url += '&' if '?' in mediagen_url else '?' 
+ mediagen_url += 'acceptMethods=fms' mediagen_doc = self._download_xml(mediagen_url, video_id, 'Downloading video urls') @@ -141,7 +142,7 @@ class MTVServicesInfoExtractor(InfoExtractor): if title_el is None: title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') if title_el is None: - title_el = itemdoc.find('.//title') + title_el = itemdoc.find('.//title') or itemdoc.find('./title') if title_el.text is None: title_el = None @@ -174,8 +175,11 @@ class MTVServicesInfoExtractor(InfoExtractor): if self._LANG: info_url += 'lang=%s&' % self._LANG info_url += data + return self._get_videos_info_from_url(info_url, video_id) + + def _get_videos_info_from_url(self, url, video_id): idoc = self._download_xml( - info_url, video_id, + url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) return self.playlist_result( [self._get_video_info(item) for item in idoc.findall('.//item')]) @@ -288,3 +292,65 @@ class MTVIggyIE(MTVServicesInfoExtractor): } } _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' + + +class MTVDEIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.de' + _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P\d+)-[^/#?]+/*(?:[#?].*)?$' + _TESTS = [{ + 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'info_dict': { + 'id': 'music_video-a50bc5f0b3aa4b3190aa', + 'ext': 'mp4', + 'title': 'MusicVideo_cro-traum', + 'description': 'Cro - Traum', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # mediagen URL without query (e.g. 
http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) + 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', + 'info_dict': { + 'id': 'local_playlist-f5ae778b9832cc837189', + 'ext': 'mp4', + 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # single video in pagePlaylist with different id + 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', + 'info_dict': { + 'id': 'local_playlist-4e760566473c4c8c5344', + 'ext': 'mp4', + 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1', + 'description': 'MTV Movies Supercut', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist = self._parse_json( + self._search_regex( + r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), + video_id) + + # news pages contain single video in playlist with different id + if len(playlist) == 1: + return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + + for item in playlist: + item_id = item.get('id') + if item_id and compat_str(item_id) == video_id: + return self._get_videos_info_from_url(item['mrss'], video_id) diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py deleted file mode 100644 index 0e46ac7c1..000000000 --- a/youtube_dl/extractor/musicvault.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MusicVaultIE(InfoExtractor): - _VALID_URL = r'https?://www\.musicvault\.com/(?P[^/?#]*)/video/(?P[^/?#]*)_(?P[0-9]+)\.html' - _TEST = { - 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html', - 'md5': '3adcbdb3dcc02d647539e53f284ba171', - 'info_dict': { - 'id': 
'1010863', - 'ext': 'mp4', - 'uploader_id': 'the-allman-brothers-band', - 'title': 'Straight from the Heart', - 'duration': 244, - 'uploader': 'The Allman Brothers Band', - 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'upload_date': '20131219', - 'location': 'Capitol Theatre (Passaic, NJ)', - 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981', - 'timestamp': int, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - - thumbnail = self._search_regex( - r'(.*?)
', webpage, 'data fields') - uploader = self._html_search_regex( - r'(.*?)', data_div, 'uploader', fatal=False) - title = self._html_search_regex( - r'(.*?)', data_div, 'title') - location = self._html_search_regex( - r'(.*?)', data_div, 'location', fatal=False) - - kaltura_id = self._search_regex( - r'
\d+:\d+)', - page, 'duration', default=None)) - - formats = [] - - mp3_url = re.search(r'''\{src:'(?P
\s*

([^<]+)

', + webpage, 'description', fatal=False) + return { + '_type': 'url_transparent', + 'ie_key': 'NDREmbedBase', + 'url': 'ndr:%s' % video_id, + 'display_id': display_id, + 'description': description, } - } + + +class NDREmbedBaseIE(InfoExtractor): + IE_NAME = 'ndr:embed:base' + _VALID_URL = r'(?:ndr:(?P[\da-z]+)|https?://www\.ndr\.de/(?P[\da-z]+)-ppjson\.json)' + _TESTS = [{ + 'url': 'ndr:soundcheck3366', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_s') + + ppjson = self._download_json( + 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) + + playlist = ppjson['playlist'] + + formats = [] + quality_key = qualities(('xs', 's', 'm', 'l', 'xl')) + + for format_id, f in playlist.items(): + src = f.get('src') + if not src: + continue + ext = determine_ext(src, None) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, m3u8_id='hls', entry_protocol='m3u8_native')) + else: + quality = f.get('quality') + ff = { + 'url': src, + 'format_id': quality or format_id, + 'quality': quality_key(quality), + } + type_ = f.get('type') + if type_ and type_.split('/')[0] == 'audio': + ff['vcodec'] = 'none' + ff['ext'] = ext or 'mp3' + formats.append(ff) + self._sort_formats(formats) + + config = playlist['config'] + + live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] + title = config['title'] + if live: + title = self._live_title(title) + uploader = ppjson.get('config', {}).get('branding') + upload_date = ppjson.get('config', {}).get('publicationDate') + duration = int_or_none(config.get('duration')) + + thumbnails = [{ + 'id': thumbnail.get('quality') or thumbnail_id, + 'url': 
thumbnail['src'], + 'preference': quality_key(thumbnail.get('quality')), + } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')] + + return { + 'id': video_id, + 'title': title, + 'is_live': live, + 'uploader': uploader if uploader != '-' else None, + 'upload_date': upload_date[0:8] if upload_date else None, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class NDREmbedIE(NDREmbedBaseIE): + IE_NAME = 'ndr:embed' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P[\da-z]+)-(?:player|externalPlayer)\.html' + _TESTS = [{ + 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', + 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', + 'info_dict': { + 'id': 'ndraktuell28488', + 'ext': 'mp4', + 'title': 'Norddeutschland begrüßt Flüchtlinge', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': '20150907', + 'duration': 132, + }, + }, { + 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', + 'md5': '002085c44bae38802d94ae5802a36e78', + 'info_dict': { + 'id': 'soundcheck3366', + 'ext': 'mp4', + 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen', + 'is_live': False, + 'uploader': 'ndr2', + 'upload_date': '20150912', + 'duration': 3554, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/info/audio51535-player.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'is_live': False, + 'uploader': 'ndrinfo', + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html', + 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c', + 'info_dict': { + 'id': 'visite11010', + 'ext': 'mp4', + 'title': 'Visite - die ganze Sendung', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': 
'20150902', + 'duration': 3525, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideoLive + 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', + 'info_dict': { + 'id': 'livestream217', + 'ext': 'flv', + 'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'upload_date': '20150910', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/doku952-player.html', + 'only_matching': True, + }] + + +class NJoyEmbedIE(NDREmbedBaseIE): + IE_NAME = 'njoy:embed' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _TESTS = [{ + # httpVideo + 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', + 'md5': '8483cbfe2320bd4d28a349d62d88bd74', + 'info_dict': { + 'id': 'doku948', + 'ext': 'mp4', + 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', + 'is_live': False, + 'upload_date': '20150807', + 'duration': 1011, + }, + }, { + # httpAudio + 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', + 'md5': 'd989f80f28ac954430f7b8a48197188a', + 'info_dict': { + 'id': 'stefanrichter100', + 'ext': 'mp3', + 'title': 'Interview mit einem Augenzeugen', + 'is_live': False, + 'uploader': 'njoy', + 'upload_date': 
'20150909', + 'duration': 140, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudioLive, no explicit ext + 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', + 'info_dict': { + 'id': 'webradioweltweit100', + 'ext': 'mp3', + 'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'uploader': 'njoy', + 'upload_date': '20150810', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 0f8aa5ada..bda1cff05 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -12,6 +12,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, int_or_none, parse_duration, @@ -100,10 +101,7 @@ class NiconicoIE(InfoExtractor): 'mail': username, 'password': password, } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') request = compat_urllib_request.Request( 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 
6b2f3f55a..b97f62fdb 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -1,64 +1,134 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .brightcove import BrightcoveIE from .common import InfoExtractor from ..utils import ExtractorError +from ..compat import ( + compat_str, + compat_urllib_request, +) -class NownessIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P[0-9]+)/(?P[^/]+?)(?:$|[?#])' +class NownessBaseIE(InfoExtractor): + def _extract_url_result(self, post): + if post['type'] == 'video': + for media in post['media']: + if media['type'] == 'video': + video_id = media['content'] + source = media['source'] + if source == 'brightcove': + player_code = self._download_webpage( + 'http://www.nowness.com/iframe?id=%s' % video_id, video_id, + note='Downloading player JavaScript', + errnote='Unable to download player JavaScript') + bc_url = BrightcoveIE._extract_brightcove_url(player_code) + if bc_url is None: + raise ExtractorError('Could not find player definition') + return self.url_result(bc_url, 'Brightcove') + elif source == 'vimeo': + return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') + elif source == 'youtube': + return self.url_result(video_id, 'Youtube') + elif source == 'cinematique': + # youtube-dl currently doesn't support cinematique + # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique') + pass - _TESTS = [ - { - 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation', - 'md5': '068bc0202558c2e391924cb8cc470676', - 'info_dict': { - 'id': '2520295746001', - 'ext': 'mp4', - 'title': 'Candor: The Art of Gesticulation', - 'description': 'Candor: The Art of Gesticulation', - 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', - } + def _api_request(self, url, request_path): + display_id = self._match_id(url) + request = compat_urllib_request.Request( + 
'http://api.nowness.com/api/' + request_path % display_id, + headers={ + 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', + }) + return display_id, self._download_json(request, display_id) + + +class NownessIE(NownessBaseIE): + IE_NAME = 'nowness' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P[^/]+?)(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation', + 'md5': '068bc0202558c2e391924cb8cc470676', + 'info_dict': { + 'id': '2520295746001', + 'ext': 'mp4', + 'title': 'Candor: The Art of Gesticulation', + 'description': 'Candor: The Art of Gesticulation', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Nowness', }, - { - 'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr', - 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', - 'info_dict': { - 'id': '3716354522001', - 'ext': 'mp4', - 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', - 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', - 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', - } + }, { + 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', + 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', + 'info_dict': { + 'id': '3716354522001', + 'ext': 'mp4', + 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', + 'description': 'Kasper Bjørke ft. 
Jaakko Eino Kalevi: TNR', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Nowness', }, - ] + }, { + # vimeo + 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', + 'md5': '9a5a6a8edf806407e411296ab6bc2a49', + 'info_dict': { + 'id': '130020913', + 'ext': 'mp4', + 'title': 'Bleu, Blanc, Rouge - A Godard Supercut', + 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20150607', + 'uploader': 'Cinema Sem Lei', + 'uploader_id': 'cinemasemlei', + }, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('slug') + _, post = self._api_request(url, 'post/getBySlug/%s') + return self._extract_url_result(post) - webpage = self._download_webpage(url, video_id) - player_url = self._search_regex( - r'"([^"]+/content/issue-[0-9.]+.js)"', webpage, 'player URL') - real_id = self._search_regex( - r'\sdata-videoId="([0-9]+)"', webpage, 'internal video ID') - player_code = self._download_webpage( - player_url, video_id, - note='Downloading player JavaScript', - errnote='Player download failed') - player_code = player_code.replace("'+d+'", real_id) +class NownessPlaylistIE(NownessBaseIE): + IE_NAME = 'nowness:playlist' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P\d+)' + _TEST = { + 'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues', + 'info_dict': { + 'id': '3286', + }, + 'playlist_mincount': 8, + } - bc_url = BrightcoveIE._extract_brightcove_url(player_code) - if bc_url is None: - raise ExtractorError('Could not find player definition') - return { - '_type': 'url', - 'url': bc_url, - 'ie_key': 'Brightcove', - } + def _real_extract(self, url): + playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s') + entries = [self._extract_url_result(item) for item in playlist['items']] + return self.playlist_result(entries, playlist_id) + + +class NownessSeriesIE(NownessBaseIE): + IE_NAME 
= 'nowness:series' + _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P[^/]+?)(?:$|[?#])' + _TEST = { + 'url': 'https://www.nowness.com/series/60-seconds', + 'info_dict': { + 'id': '60', + 'title': '60 Seconds', + 'description': 'One-minute wisdom in a new NOWNESS series', + }, + 'playlist_mincount': 4, + } + + def _real_extract(self, url): + display_id, series = self._api_request(url, 'series/getBySlug/%s') + entries = [self._extract_url_result(post) for post in series['posts']] + series_title = None + series_description = None + translations = series.get('translations', []) + if translations: + series_title = translations[0].get('title') or translations[0]['seoTitle'] + series_description = translations[0].get('seoDescription') + return self.playlist_result( + entries, compat_str(series['id']), series_title, series_description) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 66c627bec..c8257719f 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -130,10 +130,16 @@ class NowTVIE(InfoExtractor): }, { 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', 'only_matching': True, + }, { + 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', + 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) + display_id_split = display_id.split('/') + if len(display_id) > 2: + display_id = '/'.join((display_id_split[0], display_id_split[-1])) info = self._download_json( 'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index dec09cdfe..17baa9679 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -7,7 +7,7 
@@ class NowVideoIE(NovaMovIE): IE_NAME = 'nowvideo' IE_DESC = 'NowVideo' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co|li)'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|ec|sx|eu|at|ag|co|li)'} _HOST = 'www.nowvideo.ch' diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 003d27de7..ccc88cfb1 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( + ExtractorError, unified_strdate, int_or_none, qualities, @@ -12,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -28,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0', @@ -43,9 +45,27 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + }, { + # YouTube embed (metadataUrl, provider == USER_YOUTUBE) + 'url': 'http://ok.ru/video/64211978996595-1', + 'md5': '5d7475d428845cd2e13bae6f1a992278', + 'info_dict': { + 'id': '64211978996595-1', + 'ext': 'mp4', + 'title': 'Космическая среда от 26 августа 2015', + 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', + 'duration': 440, + 'upload_date': '20150826', + 'uploader_id': '750099571', + 'uploader': 'Алина П', + 'age_limit': 0, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/video/20648036891', + 'only_matching': True, }] def 
_real_extract(self, url): @@ -54,9 +74,16 @@ class OdnoklassnikiIE(InfoExtractor): webpage = self._download_webpage( 'http://ok.ru/video/%s' % video_id, video_id) + error = self._search_regex( + r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + player = self._parse_json( unescapeHTML(self._search_regex( - r'data-attributes="([^"]+)"', webpage, 'player')), + r'data-options=(?P["\'])(?P{.+?%s.+?})(?P=quote)' % video_id, + webpage, 'player', group='player')), video_id) flashvars = player['flashvars'] @@ -89,16 +116,7 @@ class OdnoklassnikiIE(InfoExtractor): like_count = int_or_none(metadata.get('likeCount')) - quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) - - formats = [{ - 'url': f['url'], - 'ext': 'mp4', - 'format_id': f['name'], - 'quality': quality(f['name']), - } for f in metadata['videos']] - - return { + info = { 'id': video_id, 'title': title, 'thumbnail': thumbnail, @@ -108,5 +126,24 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader_id': uploader_id, 'like_count': like_count, 'age_limit': age_limit, - 'formats': formats, } + + if metadata.get('provider') == 'USER_YOUTUBE': + info.update({ + '_type': 'url_transparent', + 'url': movie['contentId'], + }) + return info + + quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) + + formats = [{ + 'url': f['url'], + 'ext': 'mp4', + 'format_id': f['name'], + 'quality': quality(f['name']), + } for f in metadata['videos']] + self._sort_formats(formats) + + info['formats'] = formats + return info diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - parse_iso8601, - parse_age_limit, - 
int_or_none, -) - - -class OpenFilmIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P.+)' - _TEST = { - 'url': 'http://www.openfilm.com/videos/human-resources-remastered', - 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', - 'info_dict': { - 'id': '32736', - 'display_id': 'human-resources-remastered', - 'ext': 'mp4', - 'title': 'Human Resources (Remastered)', - 'description': 'Social Engineering in the 20th Century.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 7164, - 'timestamp': 1334756988, - 'upload_date': '20120418', - 'uploader_id': '41117', - 'view_count': int, - 'age_limit': 0, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player = compat_urllib_parse_unquote_plus( - self._og_search_video_url(webpage)) - - video = json.loads(self._search_regex( - r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - - video_url = '%s1.mp4' % video['location'] - video_id = video.get('video_id') - display_id = video.get('alias') or display_id - title = video.get('title') - description = video.get('description') - thumbnail = video.get('main_thumb') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('dt_published'), ' ') - uploader_id = video.get('user_id') - view_count = int_or_none(video.get('views_count')) - age_limit = parse_age_limit(video.get('age_limit')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py index bdc71017b..6d138ef25 100644 --- a/youtube_dl/extractor/playwire.py +++ b/youtube_dl/extractor/playwire.py @@ -19,7 +19,7 @@ class PlaywireIE(InfoExtractor): 'id': '3353705', 'ext': 'mp4', 'title': 
'S04_RM_UCL_Rus', - 'thumbnail': 're:^http://.*\.png$', + 'thumbnail': 're:^https?://.*\.png$', 'duration': 145.94, }, }, { diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 7ba396aef..fd32836cc 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -41,9 +41,7 @@ class PluralsightIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Pluralsight account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Pluralsight account is required') login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7b0cdc41a..a656ad85a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -20,7 +20,7 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' + _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '882f488fa1f0026f023f33576004a2ed', @@ -34,6 +34,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, + }, { + 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 1631faf29..7ff1d06c4 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urlparse, ) from ..utils import ( parse_duration, @@ -72,6 +73,18 @@ class 
RaiIE(InfoExtractor): 'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!', 'uploader': 'RaiTre', } + }, + { + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '037104d2c14132887e5e4cf114569214', + 'info_dict': { + 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e', + 'ext': 'flv', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'uploader': 'RaiTre', + 'upload_date': '20141221', + }, } ] @@ -90,11 +103,14 @@ class RaiIE(InfoExtractor): relinker_url = self._extract_relinker_url(webpage) if not relinker_url: - iframe_path = self._search_regex( - r']+src="/?(dl/[^"]+\?iframe\b[^"]*)"', + iframe_url = self._search_regex( + [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], webpage, 'iframe') + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) webpage = self._download_webpage( - '%s/%s' % (host, iframe_path), video_id) + iframe_url, video_id) relinker_url = self._extract_relinker_url(webpage) relinker = self._download_json( diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 82cd98ac7..5b97d33ca 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,7 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import compat_urllib_request, compat_urlparse from ..utils import ( ExtractorError, float_or_none, @@ -102,7 +102,9 @@ class RTVEALaCartaIE(InfoExtractor): if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png = self._download_webpage(png_url, video_id, 'Downloading url information') + png_request = compat_urllib_request.Request(png_url) + png_request.add_header('Referer', url) + png = 
self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) if not video_url.endswith('.f4m'): auth_url = video_url.replace( diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 4e22628d0..c67ad25ce 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -6,19 +6,19 @@ from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, int_or_none, + xpath_attr, xpath_text, ) class RuutuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P\d+)' _TESTS = [ { - 'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', + 'url': 'http://www.ruutu.fi/video/2058907', 'md5': 'ab2093f39be1ca8581963451b3c0234f', 'info_dict': { 'id': '2058907', - 'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', 'ext': 'mp4', 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? 
- Nyt se selvisi!', 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', @@ -28,14 +28,13 @@ class RuutuIE(InfoExtractor): }, }, { - 'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa', + 'url': 'http://www.ruutu.fi/video/2057306', 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', 'info_dict': { 'id': '2057306', - 'display_id': 'superpesis-katso-koko-kausi-ruudussa', 'ext': 'mp4', 'title': 'Superpesis: katso koko kausi Ruudussa', - 'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77', + 'description': 'md5:da2736052fef3b2bd5e0005e63c25eac', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 40, 'age_limit': 0, @@ -44,29 +43,10 @@ class RuutuIE(InfoExtractor): ] def _real_extract(self, url): - display_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-media-id="(\d+)"', webpage, 'media id') - - video_xml_url = None - - media_data = self._search_regex( - r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage, - 'media data', default=None) - if media_data: - media_json = self._parse_json(media_data, display_id, fatal=False) - if media_json: - xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl') - if xml_url: - video_xml_url = xml_url.replace('{ID}', video_id) - - if not video_xml_url: - video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id - - video_xml = self._download_xml(video_xml_url, video_id) + video_xml = self._download_xml( + 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id) formats = [] processed_urls = [] @@ -109,10 +89,9 @@ class RuutuIE(InfoExtractor): return { 'id': video_id, - 'display_id': display_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), + 'description': xpath_attr(video_xml, 
'.//Behavior/Program', 'description', 'description'), + 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), 'formats': formats, diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index f3c80708c..a602af692 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -20,7 +20,6 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' _SUCCESSFUL_LOGIN_REGEX = r']*>Sign Out' - _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' _NETRC_MACHINE = 'safari' _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' @@ -37,9 +36,7 @@ class SafariBaseIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - self._ACCOUNT_CREDENTIALS_HINT, - expected=True) + self.raise_login_required('safaribooksonline.com account is required') headers = std_headers if 'Referer' not in headers: diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 220d39078..05f93904c 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -12,8 +12,8 @@ from ..utils import ( class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=(?P.+)' - + _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P[A-Za-z0-9-]+)' + EMBED_PATTERN = r'src=(["\'])(?P(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' _TESTS = [{ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', 
'only_matching': True, @@ -33,7 +33,7 @@ class ScreenwaveMediaIE(InfoExtractor): 'http://player.screenwavemedia.com/player.js', video_id, 'Downloading playerconfig webpage') - videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') + videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver') sources = self._parse_json( js_to_json( @@ -56,6 +56,7 @@ class ScreenwaveMediaIE(InfoExtractor): # Fallback to hardcoded sources if JS changes again if not sources: + self.report_warning('Falling back to a hardcoded list of streams') sources = [{ 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), 'type': 'mp4', diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 6e9903d5e..f76fb12c0 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -16,7 +16,7 @@ class ShahidIE(InfoExtractor): 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', 'info_dict': { 'id': '90574', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', 'duration': 2972, @@ -81,7 +81,7 @@ class ShahidIE(InfoExtractor): compat_urllib_parse.urlencode({ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - }).encode('utf-8')), + })), video_id, 'Downloading video JSON') video = video[api_vars['playerType']] diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index a07677686..c5636e8e9 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,17 +14,28 @@ from ..utils import ( class SharedIE(InfoExtractor): - _VALID_URL = r'http://shared\.sx/(?P[\da-z]{10})' + IE_DESC = 'shared.sx and vivo.sx' + _VALID_URL = 
r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' - _TEST = { + _TESTS = [{ 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { 'id': '0060718775', 'ext': 'mp4', 'title': 'Bmp4', + 'filesize': 1720110, }, - } + }, { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 528031, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 93a7cfe15..35a81ee87 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -330,10 +330,7 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Erotic broadcasts allowed only for registered users, ' - 'use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index ae94f055c..2c8e9b941 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,24 +1,51 @@ # coding: utf-8 from __future__ import unicode_literals -from .mitele import MiTeleIE +import json + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import ( + get_element_by_attribute, + parse_duration, + strip_jsonp, +) -class TelecincoIE(MiTeleIE): - IE_NAME = 'telecinco.es' - _VALID_URL = r'https?://www\.telecinco\.es/(?:[^/]+/)+(?P.+?)\.html' +class TelecincoIE(InfoExtractor): + IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' + _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P.+?)\.html' _TESTS = [{ 'url': 
'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', + 'md5': '5cbef3ad5ef17bf0d21570332d140729', 'info_dict': { 'id': 'MDSVID20141015_0058', 'ext': 'mp4', 'title': 'Con Martín Berasategui, hacer un bacalao al ...', 'duration': 662, }, - 'params': { - # m3u8 download - 'skip_download': True, + }, { + 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', + 'md5': '0a5b9f3cc8b074f50a0578f823a12694', + 'info_dict': { + 'id': 'MDSVID20150916_0128', + 'ext': 'mp4', + 'title': '¿Quién es este ex futbolista con el que hablan ...', + 'duration': 79, + }, + }, { + 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', + 'md5': 'ad1bfaaba922dd4a295724b05b68f86a', + 'info_dict': { + 'id': 'MDSVID20150513_0220', + 'ext': 'mp4', + 'title': '#DOYLACARA. Con la trata no hay trato', + 'duration': 50, }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', @@ -27,3 +54,41 @@ class TelecincoIE(MiTeleIE): 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', 'only_matching': True, }] + + def _real_extract(self, url): + episode = self._match_id(url) + webpage = self._download_webpage(url, episode) + embed_data_json = self._search_regex( + r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', + ).replace('\'', '"') + embed_data = json.loads(embed_data_json) + + domain = embed_data['mediaUrl'] + if not domain.startswith('http'): + # only happens in telecinco.es videos + domain = 'http://' + domain + info_url = compat_urlparse.urljoin( + domain, + compat_urllib_parse_unquote(embed_data['flashvars']['host']) + ) + info_el = self._download_xml(info_url, episode).find('./video/info') + + video_link = info_el.find('videoUrl/link').text + token_query = 
compat_urllib_parse.urlencode({'id': video_link}) + token_info = self._download_json( + embed_data['flashvars']['ov_tk'] + '?' + token_query, + episode, + transform_source=strip_jsonp + ) + formats = self._extract_m3u8_formats( + token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') + + return { + 'id': embed_data['videoId'], + 'display_id': episode, + 'title': info_el.find('title').text, + 'formats': formats, + 'description': get_element_by_attribute('class', 'text', webpage), + 'thumbnail': info_el.find('thumb').text, + 'duration': parse_duration(info_el.find('duration').text), + } diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 2c4b21807..4f86b3ee9 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -60,9 +60,7 @@ class TubiTvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): - raise ExtractorError( - 'This video requires login, use --username and --password ' - 'options to provide account credentials.', expected=True) + self.raise_login_required('This video requires login') title = self._og_search_title(webpage) description = self._og_search_description(webpage) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 84fe71aef..5f7ac4b35 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -2,14 +2,12 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..compat import compat_str class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': 
'140a49ed444bd22f93330985d8475fcb', @@ -27,41 +25,41 @@ class TudouIE(InfoExtractor): 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', } + }, { + 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', + 'only_matching': True, }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - def _url_for_id(self, id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(id) + def _url_for_id(self, video_id, quality=None): + info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, id, "Opening the info webpage") - final_url = self._html_search_regex('>(.+?)', webpage, 'video url') + xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") + final_url = xml_data.text return final_url def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) - if m and m.group(1): - return { - '_type': 'url', - 'url': 'youku:' + m.group(1), - 'ie_key': 'Youku' - } + youku_vcode = self._search_regex( + r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) + if youku_vcode: + return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( - r",kw:\s*['\"](.+?)[\"']", webpage, 'title') + r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title') thumbnail_url = self._search_regex( - r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False) player_url = self._search_regex( - r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]', webpage, 'player URL', default=self._PLAYER_URL) - segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') - segments = json.loads(segs_json) + segments = self._parse_json(self._search_regex( + 
r'segs: \'([^\']+)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4a0eaf65f..365d8b4bf 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -70,9 +70,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Udemy account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Udemy account is required') login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 157bb74fe..9a794e609 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, float_or_none, - str_to_int, + parse_iso8601, ) @@ -12,18 +14,41 @@ class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://vid.me/QNB', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 'md5': 'c62f1156138dc3323902188c5b5a8bd6', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'duration': 119.92, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', - 'thumbnail': 're:^https?://.*\.jpg', + 'age_limit': 0, + 'duration': 119.92, 'view_count': int, 'like_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://vid.me/Gc6M', + 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 
'info_dict': { + 'id': 'Gc6M', + 'ext': 'mp4', + 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1441211642, + 'upload_date': '20150902', + 'uploader': 'SunshineM', + 'uploader_id': '3552827', + 'age_limit': 0, + 'duration': 223.72, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # tests uploader field @@ -33,63 +58,94 @@ class VidmeIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'duration': 97.859999999999999, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', - 'thumbnail': 're:^https?://.*\.jpg', + 'uploader_id': '109747', + 'age_limit': 0, + 'duration': 97.859999999999999, 'view_count': int, 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { - # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching + # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching 'url': 'https://vid.me/e/Wmur', - 'only_matching': True, + 'info_dict': { + 'id': 'Wmur', + 'ext': 'mp4', + 'title': 'naked smoking & stretching', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1430931613, + 'upload_date': '20150506', + 'uploader': 'naked-yogi', + 'uploader_id': '1638622', + 'age_limit': 18, + 'duration': 653.26999999999998, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): - url = url.replace('vid.me/e/', 'vid.me/') video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'\s*([\d,\.]+)\s*plays?', - webpage, 'view count', fatal=False)) - like_count = str_to_int(self._html_search_regex( - r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', - webpage, 'like count', 
fatal=False)) - uploader = self._html_search_regex( - 'class="video_author_username"[^>]*>([^<]+)', - webpage, 'uploader', default=None) + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) + + video = response['video'] + + formats = [{ + 'format_id': f.get('type'), + 'url': f['uri'], + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + } for f in video.get('formats', []) if f.get('uri')] + self._sort_formats(formats) + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnail_url') + timestamp = parse_iso8601(video.get('date_created'), ' ') + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('user_id') + age_limit = 18 if video.get('nsfw') is True else 0 + duration = float_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + like_count = int_or_none(video.get('likes_count')) + comment_count = int_or_none(video.get('comment_count')) return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'age_limit': age_limit, 'timestamp': timestamp, - 'width': width, - 'height': height, 'duration': duration, 'view_count': view_count, 'like_count': like_count, - 'uploader': uploader, + 'comment_count': comment_count, + 'formats': formats, } diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 15377097e..c76c20614 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor @@ -91,31 +92,27 @@ class VierVideosIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') - webpage = self._download_webpage(url, program) - page_id = mobj.group('page') if page_id: 
page_id = int(page_id) start_page = page_id - last_page = start_page + 1 playlist_id = '%s-page%d' % (program, page_id) else: start_page = 0 - last_page = int(self._search_regex( - r'videos\?page=(\d+)">laatste', - webpage, 'last page', default=0)) + 1 playlist_id = program entries = [] - for current_page_id in range(start_page, last_page): + for current_page_id in itertools.count(start_page): current_page = self._download_webpage( 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), program, - 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage + 'Downloading page %d' % (current_page_id + 1)) page_entries = [ self.url_result('http://www.vier.be' + video_url, 'Vier') for video_url in re.findall( r'

', current_page)] entries.extend(page_entries) + if page_id or '>Meer<' not in current_page: + break return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py new file mode 100644 index 000000000..86c1cb5ef --- /dev/null +++ b/youtube_dl/extractor/vlive.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hmac +from hashlib import sha1 +from base64 import b64encode +from time import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext +) +from ..compat import compat_urllib_parse + + +class VLiveIE(InfoExtractor): + IE_NAME = 'vlive' + # www.vlive.tv/video/ links redirect to m.vlive.tv/video/ for mobile devices + _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://m.vlive.tv/video/1326', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': '[V] Girl\'s Day\'s Broadcast', + 'creator': 'Girl\'s Day', + }, + } + _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://m.vlive.tv/video/%s' % video_id, + video_id, note='Download video page') + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + creator = self._html_search_regex( + r']+class="name">([^<>]+)', webpage, 'creator') + + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' 
% video_id + msgpad = '%.0f' % (time() * 1000) + md = b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad).encode('ascii'), sha1).digest() + ) + url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md}) + playinfo = self._download_json(url, video_id, 'Downloading video json') + + if playinfo.get('message', '') != 'success': + raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful')) + + if not playinfo.get('result'): + raise ExtractorError('No videos found.') + + formats = [] + for vid in playinfo['result'].get('videos', {}).get('list', []): + formats.append({ + 'url': vid['source'], + 'ext': 'mp4', + 'abr': vid.get('bitrate', {}).get('audio'), + 'vbr': vid.get('bitrate', {}).get('video'), + 'format_id': vid['encodingOption']['name'], + 'height': vid.get('height'), + 'width': vid.get('width'), + }) + self._sort_formats(formats) + + subtitles = {} + for caption in playinfo['result'].get('captions', {}).get('list', []): + subtitles[caption['language']] = [ + {'ext': determine_ext(caption['source'], default_ext='vtt'), + 'url': caption['source']}] + + return { + 'id': video_id, + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 72eb010f8..ec8b99998 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -19,25 +19,25 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': '79132cc09ec5309fa590ae46e4cc31bc', + 'md5': 'b9be794ceb56c7267d410a13f99d801a', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'Breaking Points: The Paper Mine', - 'duration': 1287, + 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. 
But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', 'uploader': 'The Washington Post', 'timestamp': 1395527908, 'upload_date': '20140322', }, }, { - 'md5': 'e1d5734c06865cc504ad99dc2de0d443', + 'md5': '1fff6a689d8770966df78c8cb6c8c17c', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', - 'duration': 2217, + 'duration': 2220, 'timestamp': 1395528005, 'upload_date': '20140322', 'uploader': 'The Washington Post', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index f69d46a28..e4f50e64c 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,40 +1,33 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P[^/]+)/' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', - 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1', + 'md5': 'ee21217ffd66d058e8b16be340b74883', 'info_dict': { 'id': 'maruexhausted', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Maru is exhausted.', 'description': 'md5:57e099e857c0a4ea312542b684a869b8', } }, { - # youtube video 'url': 'http://www.wimp.com/clowncar/', + 'md5': '4e2986c793694b55b37cf92521d12bb4', 'info_dict': { - 'id': 'cG4CEr2aiSg', + 'id': 'clowncar', 'ext': 'mp4', - 'title': 'Basset hound clown car...incredible!', - 'description': 'md5:8d228485e0719898c017203f900b3a35', - 'uploader': 'Gretchen Hoey', - 'uploader_id': 'gretchenandjeff1', - 'upload_date': '20140303', + 'title': 'It\'s like 
a clown car.', + 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2', }, - 'add_ie': ['Youtube'], }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 5aac8adb3..8bbac54e2 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -19,7 +19,7 @@ class XuiteIE(InfoExtractor): _TESTS = [{ # Audio 'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2', - 'md5': '63a42c705772aa53fd4c1a0027f86adf', + 'md5': 'e79284c87b371424885448d11f6398c8', 'info_dict': { 'id': '3860914', 'ext': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index f9afbdbab..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -101,7 +101,7 @@ class YahooIE(InfoExtractor): } }, { 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '67010fdf3a08d290e060a4dd96baa07b', + 'md5': '88e209b417f173d86186bef6e4d1f160', 'info_dict': { 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', 'ext': 'mp4', @@ -144,6 +144,17 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/-100120367.html', 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + }, } ] @@ -171,6 +182,19 @@ class YahooIE(InfoExtractor): if 
nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Query result is often embedded in webpage as JSON. Sometimes explicit requests + # to video API results in a failure with geo restriction reason therefore using + # embedded query result when present sounds reasonable. + config_json = self._search_regex( + r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:|$)', + webpage, 'videoplayer applet', default=None) + if config_json: + config = self._parse_json(config_json, display_id, fatal=False) + if config: + sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') + if sapi: + return self._extract_info(display_id, sapi, webpage) + items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) @@ -190,22 +214,10 @@ class YahooIE(InfoExtractor): video_id = info['id'] return self._get_info(video_id, display_id, webpage) - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - - info = query_result['query']['results']['mediaObj'][0] + def _extract_info(self, display_id, query, webpage): + info = query['query']['results']['mediaObj'][0] meta = info.get('meta') + video_id = info.get('id') if not meta: msg = info['status'].get('msg') @@ -231,6 +243,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', }) else: + if s.get('format') == 'm3u8_playlist': + format_info['protocol'] = 'm3u8_native' + format_info['ext'] = 'mp4' format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url 
formats.append(format_info) @@ -264,6 +279,21 @@ class YahooIE(InfoExtractor): 'subtitles': subtitles, } + def _get_info(self, video_id, display_id, webpage): + region = self._search_regex( + r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', + webpage, 'region', fatal=False, default='US') + data = compat_urllib_parse.urlencode({ + 'protocol': 'http', + 'region': region, + }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) + query_result = self._download_json( + query_url, display_id, 'Downloading video info') + return self._extract_info(display_id, query_result, webpage) + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 91829be1c..4098e4629 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -5,7 +5,11 @@ import re import hashlib from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse, + compat_urllib_request, +) from ..utils import ( int_or_none, float_or_none, @@ -67,7 +71,7 @@ class YandexMusicPlaylistBaseIE(InfoExtractor): return [ self.url_result( 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) - for track in tracks] + for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): @@ -106,7 +110,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_DESC = 'Яндекс.Музыка - Плейлист' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', @@ -114,19 +118,54 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, - 
} + }, { + # playlist exceeding the limit of 150 tracks shipped with webpage (see + # https://github.com/rg3/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'info_dict': { + 'id': '1036', + 'title': 'Музыка 90-х', + }, + 'playlist_count': 310, + }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - playlist = self._parse_json( + mu = self._parse_json( self._search_regex( r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), - playlist_id)['pageData']['playlist'] + playlist_id) + + playlist = mu['pageData']['playlist'] + tracks, track_ids = playlist['tracks'], playlist['trackIds'] + + # tracks dictionary shipped with webpage is limited to 150 tracks, + # missing tracks should be retrieved manually. + if len(tracks) < len(track_ids): + present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) + missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) + request = compat_urllib_request.Request( + 'https://music.yandex.ru/handlers/track-entries.jsx', + compat_urllib_parse.urlencode({ + 'entries': ','.join(missing_track_ids), + 'lang': mu.get('settings', {}).get('lang', 'en'), + 'external-domain': 'music.yandex.ru', + 'overembed': 'false', + 'sign': mu.get('authData', {}).get('user', {}).get('sign'), + 'strict': 'true', + }).encode('utf-8')) + request.add_header('Referer', url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + + missing_tracks = self._download_json( + request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + if missing_tracks: + tracks.extend(missing_tracks) return self.playlist_result( - self._build_playlist(playlist['tracks']), + self._build_playlist(tracks), compat_str(playlist_id), playlist['title'], playlist.get('description')) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 78caeb8b3..2e81d9223 100644 --- 
a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -49,6 +49,17 @@ class YoukuIE(InfoExtractor): }, 'playlist_count': 13, 'skip': 'Available in China only', + }, { + 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', + 'note': 'Video protected with password', + 'info_dict': { + 'id': 'XNjA1NzA2Njgw', + 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', + }, + 'playlist_count': 19, + 'params': { + 'videopassword': '100600', + }, }] def construct_video_urls(self, data1, data2): @@ -185,9 +196,15 @@ class YoukuIE(InfoExtractor): raw_data = self._download_json(req, video_id, note=note) return raw_data['data'][0] + video_password = self._downloader.params.get('videopassword', None) + # request basic data + basic_data_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id + if video_password: + basic_data_url += '?password=%s' % video_password + data1 = retrieve_data( - 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id, + basic_data_url, 'Downloading JSON metadata 1') data2 = retrieve_data( 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e2da46e3..b252e36e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + encode_dict, ExtractorError, float_or_none, get_element_by_attribute, @@ -111,10 +112,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') req = compat_urllib_request.Request(self._LOGIN_URL, login_data) login_results = 
self._download_webpage( @@ -147,8 +145,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'TrustDevice': 'on', }) - tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items()) - tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') + tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( @@ -660,7 +657,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1243,7 +1240,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) - url_map = {} + formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) if 'itag' not in url_data or 'url' not in url_data: @@ -1289,7 +1286,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - r'html5player-([^/]+?)(?:/html5player)?\.js', + r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1303,8 +1300,50 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' - url_map[format_id] = url - formats = _map_to_format_list(url_map) + + # Some itags are not 
included in DASH manifest thus corresponding formats will + # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). + # Trying to extract metadata from url_encoded_fmt_stream_map entry. + mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) + width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + dct = { + 'format_id': format_id, + 'url': url, + 'player_url': player_url, + 'filesize': int_or_none(url_data.get('clen', [None])[0]), + 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), + 'width': width, + 'height': height, + 'fps': int_or_none(url_data.get('fps', [None])[0]), + 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], + } + type_ = url_data.get('type', [None])[0] + if type_: + type_split = type_.split(';') + kind_ext = type_split[0].split('/') + if len(kind_ext) == 2: + kind, ext = kind_ext + dct['ext'] = ext + if kind in ('audio', 'video'): + codecs = None + for mobj in re.finditer( + r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): + if mobj.group('key') == 'codecs': + codecs = mobj.group('val') + break + if codecs: + codecs = codecs.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs[0], codecs[1] + else: + acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) + dct.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) + if format_id in self._formats: + dct.update(self._formats[format_id]) + formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) @@ -1615,12 +1654,15 @@ class YoutubeChannelIE(InfoExtractor): channel_page = self._download_webpage( url + '?view=57', channel_id, 'Downloading channel page', fatal=False) - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_playlist_id = self._search_regex( 
- r'data-channel-external-id="([^"]+)"', - channel_page, 'channel id', default=None) + if channel_page is False: + channel_playlist_id = False + else: + channel_playlist_id = self._html_search_meta( + 'channelId', channel_page, 'channel id', default=None) + if not channel_playlist_id: + channel_playlist_id = self._search_regex( + r'data-channel-external-id="([^"]+)"', + channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -1796,8 +1838,8 @@ class YoutubeShowIE(InfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/show/(?P[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ - 'url': 'http://www.youtube.com/show/airdisasters', - 'playlist_mincount': 3, + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, 'info_dict': { 'id': 'airdisasters', 'title': 'Air Disasters', @@ -1808,7 +1850,7 @@ class YoutubeShowIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage( - url, playlist_id, 'Downloading show webpage') + 'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) @@ -1931,6 +1973,7 @@ class YoutubeTruncatedURLIE(InfoExtractor): annotation_id=annotation_[^&]+| x-yt-cl=[0-9]+| hl=[^&]*| + t=[0-9]+ )? 
| attribution_link\?a=[^&]+ @@ -1953,6 +1996,9 @@ class YoutubeTruncatedURLIE(InfoExtractor): }, { 'url': 'https://www.youtube.com/watch?hl=en-GB', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?t=2372', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9016e3498..5eccc0a70 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import os.path import optparse -import shlex import sys from .downloader.external import list_external_downloaders @@ -11,6 +10,7 @@ from .compat import ( compat_get_terminal_size, compat_getenv, compat_kwargs, + compat_shlex_split, ) from .utils import ( preferredencoding, @@ -28,7 +28,7 @@ def parseOpts(overrideArguments=None): try: res = [] for l in optionf: - res += shlex.split(l, comments=True) + res += compat_shlex_split(l, comments=True) finally: optionf.close() return res @@ -320,7 +320,7 @@ def parseOpts(overrideArguments=None): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, smotri)') + help='Video password (vimeo, smotri, youku)') video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index 4191d040b..599dd1df2 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -4,6 +4,7 @@ import os from ..utils import ( PostProcessingError, + cli_configuration_args, encodeFilename, ) @@ -61,11 +62,7 @@ class PostProcessor(object): self._downloader.report_warning(errnote) def _configuration_args(self, default=[]): - pp_args = self._downloader.params.get('postprocessor_args') - if pp_args is None: - return default - assert isinstance(pp_args, list) - return pp_args + return cli_configuration_args(self._downloader.params, 'postprocessor_args', default) class 
AudioConversionError(PostProcessingError): diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1f723908b..4f320e124 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -135,7 +135,10 @@ class FFmpegPostProcessor(PostProcessor): files_cmd = [] for path in input_paths: - files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)]) + files_cmd.extend([ + encodeArgument('-i'), + encodeFilename(self._ffmpeg_filename_argument(path), True) + ]) cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + files_cmd + [encodeArgument(o) for o in opts] + @@ -155,10 +158,10 @@ class FFmpegPostProcessor(PostProcessor): self.run_ffmpeg_multiple_files([path], out_path, opts) def _ffmpeg_filename_argument(self, fn): - # ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details - if fn.startswith('-'): - return './' + fn - return fn + # Always use 'file:' because the filename may contain ':' (ffmpeg + # interprets that as a protocol) or can start with '-' (-- is broken in + # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) + return 'file:' + fn class FFmpegExtractAudioPP(FFmpegPostProcessor): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e265c7574..206dd56bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -141,7 +141,7 @@ def write_json_file(obj, fn): if sys.version_info >= (2, 7): def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z-]+$', key) + assert re.match(r'^[a-zA-Z_-]+$', key) if val: assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) @@ -176,12 +176,12 @@ def xpath_with_ns(path, ns_map): return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): if 
sys.version_info < (2, 7): # Crazy 2.6 xpath = xpath.encode('ascii') n = node.find(xpath) - if n is None or n.text is None: + if n is None: if default is not NO_DEFAULT: return default elif fatal: @@ -189,9 +189,37 @@ def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): raise ExtractorError('Could not find XML element %s' % name) else: return None + return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): + n = xpath_element(node, xpath, name, fatal=fatal, default=default) + if n is None or n == default: + return n + if n.text is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = xpath if name is None else name + raise ExtractorError('Could not find XML element\'s text %s' % name) + else: + return None return n.text +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): + n = find_xpath_attr(node, xpath, key) + if n is None: + if default is not NO_DEFAULT: + return default + elif fatal: + name = '%s[@%s]' % (xpath, key) if name is None else name + raise ExtractorError('Could not find XML attribute %s' % name) + else: + return None + return n.attrib[key] + + def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" return get_element_by_attribute("id", id, html) @@ -587,6 +615,11 @@ class ContentTooShortError(Exception): def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): + # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting + # expected HTTP responses to meet HTTP/1.0 or later (see also + # https://github.com/rg3/youtube-dl/issues/6727) + if sys.version_info < (3, 0): + kwargs['strict'] = True hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -715,7 +748,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = 
self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/rg3/youtube-dl/issues/6457). if 300 <= resp.code < 400: location = resp.headers.get('Location') if location: @@ -749,6 +783,30 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): + def __init__(self, cookiejar=None): + compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + + def http_response(self, request, response): + # Python 2 will choke on next HTTP request in row if there are non-ASCII + # characters in Set-Cookie HTTP header of last response (see + # https://github.com/rg3/youtube-dl/issues/6769). + # In order to at least prevent crashing we will percent encode Set-Cookie + # header before HTTPCookieProcessor starts processing it. 
+ # if sys.version_info < (3, 0) and response.headers: + # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): + # set_cookie = response.headers.get(set_cookie_header) + # if set_cookie: + # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") + # if set_cookie != set_cookie_escaped: + # del response.headers[set_cookie_header] + # response.headers[set_cookie_header] = set_cookie_escaped + return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = compat_urllib_request.HTTPCookieProcessor.http_request + https_response = http_response + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -1578,6 +1636,10 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +def encode_dict(d, encoding='utf-8'): + return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) + + try: etree_iter = xml.etree.ElementTree.Element.iter except AttributeError: # Python <=2.6 @@ -1918,6 +1980,32 @@ def dfxp2srt(dfxp_data): return ''.join(out) +def cli_option(params, command_option, param): + param = params.get(param) + return [command_option, param] if param is not None else [] + + +def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): + param = params.get(param) + assert isinstance(param, bool) + if separator: + return [command_option + separator + (true_value if param else false_value)] + return [command_option, true_value if param else false_value] + + +def cli_valueless_option(params, command_option, param, expected_value=True): + param = params.get(param) + return [command_option] if param == expected_value else [] + + +def cli_configuration_args(params, param, default=[]): + ex_args = params.get(param) + if ex_args is None: + return default + assert isinstance(ex_args, list) + return ex_args + + class 
ISO639Utils(object): # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt _lang_map = { diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 394951ca7..0cc7411f2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.23' +__version__ = '2015.09.09'