diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 000000000..5b1f573e7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.03.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.03.27** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? +- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. 
It should look similar to the one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKc'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2016.03.27 +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace the following example URLs with **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. +If work on your *issue* requires account credentials, please provide them or explain how one can obtain them. 
diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md new file mode 100644 index 000000000..a5e6a4233 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -0,0 +1,58 @@ +## Please follow the guide below + +- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) +- Use *Preview* tab to see how your issue will actually look like + +--- + +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s** + +### Before submitting an *issue* make sure you have: +- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones + +### What is the purpose of your *issue*? 
+- [ ] Bug report (encountered problems with youtube-dl) +- [ ] Site support request (request for adding support for a new site) +- [ ] Feature request (request for a new functionality) +- [ ] Question +- [ ] Other + +--- + +### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* + +--- + +### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: + +Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to the one below (replace it with **your** log inserted between triple ```): +``` +$ youtube-dl -v +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKc'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version %(version)s +[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... + +``` + +--- + +### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace the following example URLs with **yours**): +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + +--- + +### Description of your *issue*, suggested solution and other information + +Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. 
+If work on your *issue* requires account credentials, please provide them or explain how one can obtain them. diff --git a/AUTHORS b/AUTHORS index aa48cd5a6..ea8d39978 100644 --- a/AUTHORS +++ b/AUTHORS @@ -163,3 +163,7 @@ Patrick Griffis Aidan Rowe mutantmonkey Ben Congdon +Kacper Michajłow +José Joaquín Atria +Viťas Strádal +Kagami Hiiragi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c996f03ab..0df6193fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/Makefile b/Makefile index e98806791..3a6c37944 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ -all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites +all: youtube-dl README.md CONTRIBUTING.md ISSUE_TEMPLATE.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . 
-name "*.class" -delete @@ -12,15 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc -else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif -endif +SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) @@ -67,6 +59,9 @@ README.md: youtube_dl/*.py youtube_dl/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md +ISSUE_TEMPLATE.md: + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md .github/ISSUE_TEMPLATE.md + supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md diff --git a/README.md b/README.md index 68db546ef..e972bf69f 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,8 @@ which means you can modify it, redistribute it or use it however you like. (e.g. 50K or 4.2M) -R, --retries RETRIES Number of retries (default is 10), or "infinite". + --fragment-retries RETRIES Number of retries for a fragment (default + is 10), or "infinite" (DASH only) --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -376,8 +378,8 @@ which means you can modify it, redistribute it or use it however you like. 
--no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default - --embed-subs Embed subtitles in the video (only for mkv - and mp4 videos) + --embed-subs Embed subtitles in the video (only for mp4, + webm and mkv videos) --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / @@ -598,6 +600,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case. `http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `m3u8`, or `m3u8_native` + - `format_id`: A short description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by video hoster. 
@@ -831,7 +834,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py new file mode 100644 index 000000000..b7ad23d83 --- /dev/null +++ b/devscripts/make_issue_template.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +import io +import optparse + + +def main(): + parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') + options, args = parser.parse_args() + if len(args) != 2: + parser.error('Expected an input and an output filename') + + infile, outfile = args + + with io.open(infile, encoding='utf-8') as inf: + issue_template_tmpl = inf.read() + + # Get the version from youtube_dl/version.py without importing the package + exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + + out = issue_template_tmpl % {'version': locals()['__version__']} + + with io.open(outfile, 'w', encoding='utf-8') as outf: + outf.write(out) + +if __name__ == '__main__': + main() diff --git a/devscripts/release.sh b/devscripts/release.sh index 61806961c..6718ce39b 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -45,9 +45,9 @@ fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py -/bin/echo -e "\n### Committing documentation and youtube_dl/version.py..." -make README.md CONTRIBUTING.md supportedsites -git add README.md CONTRIBUTING.md docs/supportedsites.md youtube_dl/version.py +/bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..." 
+make README.md CONTRIBUTING.md ISSUE_TEMPLATE.md supportedsites +git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py git commit -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3415efc45..00b8c247c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -74,6 +74,7 @@ - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **BioBioChileTV** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -100,6 +101,7 @@ - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** + - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 - **Chaturbate** @@ -244,6 +246,7 @@ - **GPUTechConf** - **Groupon** - **Hark** + - **HBO** - **HearThisAt** - **Heise** - **HellPorno** @@ -344,6 +347,7 @@ - **MiTele**: mitele.es - **mixcloud** - **MLB** + - **Mnet** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** - **Mojvideo** @@ -440,6 +444,7 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** + - **Openload** - **OraTV** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -525,6 +530,7 @@ - **RUTV**: RUTV.RU - **Ruutu** - **safari**: safaribooksonline.com online video + - **safari:api** - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos @@ -618,6 +624,7 @@ - **TheOnion** - **ThePlatform** - **ThePlatformFeed** + - **TheScene** - **TheSixtyOne** - **TheStar** - **ThisAmericanLife** @@ -786,6 +793,7 @@ - **youtube:channel**: YouTube.com channels - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) + - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - **youtube:playlists**: YouTube.com 
user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) diff --git a/setup.cfg b/setup.cfg index 26857750c..5760112d4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/make_issue_template.py,setup.py,build,.git ignore = E402,E501,E731 diff --git a/test/test_compat.py b/test/test_compat.py index b6bfad05e..cc105807a 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -19,6 +19,7 @@ from youtube_dl.compat import ( compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, ) @@ -70,6 +71,12 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + def test_compat_urllib_parse_urlencode(self): + self.assertEqual(compat_urllib_parse_urlencode({'abc': 'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({b'abc': 'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode({b'abc': b'def'}), 'abc=def') + def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) diff --git a/test/test_http.py b/test/test_http.py index fc59b1aed..15e0ad369 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 from __future__ import unicode_literals # Allow direct execution @@ -120,5 +121,14 @@ class TestProxy(unittest.TestCase): response = ydl.urlopen(req).read().decode('utf-8') self.assertEqual(response, 'cn: {0}'.format(url)) + def test_proxy_with_idn(self): + ydl = YoutubeDL({ + 'proxy': 
'localhost:{0}'.format(self.port), + }) + url = 'http://中文.tw/' + response = ydl.urlopen(url).read().decode('utf-8') + # b'xn--fiq228c' is '中文'.encode('idna') + self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 325b870cc..a35debfe1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -577,11 +577,11 @@ class TestUtil(unittest.TestCase): ) self.assertEqual( escape_url('http://тест.рф/фрагмент'), - 'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' + 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), - 'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' + 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') diff --git a/tox.ini b/tox.ini index 48504329f..2d7134005 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,6 @@ deps = passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py + --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 93b6ca54d..d7aa951ff 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -39,6 +39,8 @@ from .compat import ( compat_urllib_request_DataHandler, ) from .utils import ( + age_restricted, + args_to_str, ContentTooShortError, date_from_str, DateRange, @@ -58,13 +60,16 @@ from .utils import ( PagedList, parse_filesize, 
PerRequestProxyHandler, - PostProcessingError, platform_name, + PostProcessingError, preferredencoding, + prepend_extension, render_table, + replace_extension, SameFileError, sanitize_filename, sanitize_path, + sanitize_url, sanitized_Request, std_headers, subtitles_filename, @@ -75,10 +80,6 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, - prepend_extension, - replace_extension, - args_to_str, - age_restricted, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractors @@ -1229,6 +1230,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): + t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: @@ -1263,6 +1265,8 @@ class YoutubeDL(object): if subtitles: for _, subtitle in subtitles.items(): for subtitle_format in subtitle: + if subtitle_format.get('url'): + subtitle_format['url'] = sanitize_url(subtitle_format['url']) if 'ext' not in subtitle_format: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() @@ -1292,6 +1296,8 @@ class YoutubeDL(object): if 'url' not in format: raise ExtractorError('Missing "url" key in result (index %d)' % i) + format['url'] = sanitize_url(format['url']) + if format.get('format_id') is None: format['format_id'] = compat_str(i) else: @@ -1836,7 +1842,7 @@ class YoutubeDL(object): if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 79b389840..737f6545d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -144,14 +144,20 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') 
opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -299,7 +305,8 @@ def _real_main(argv=None): 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index dbb91a6ef..76b6b0e38 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -169,6 +169,31 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.parse import urlencode as compat_urllib_parse_urlencode +except ImportError: # Python 2 + # Python 2 will choke in urlencode on mixture of byte and unicode strings. + # Possible solutions are to either port it from python 3 with all + # the friends or manually ensure input query contains only byte strings. + # We will stick with latter thus recursively encoding the whole query. 
+ def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): + def encode_elem(e): + if isinstance(e, dict): + e = encode_dict(e) + elif isinstance(e, (list, tuple,)): + e = encode_list(e) + elif isinstance(e, compat_str): + e = e.encode(encoding) + return e + + def encode_dict(d): + return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) + + def encode_list(l): + return [encode_elem(e) for e in l] + + return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + try: from urllib.request import DataHandler as compat_urllib_request_DataHandler except ImportError: # Python < 3.4 @@ -588,6 +613,7 @@ __all__ = [ 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index f39db58f6..1dba9f49a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -115,6 +115,10 @@ class FileDownloader(object): return '%10s' % '---b/s' return '%10s' % ('%s/s' % format_bytes(speed)) + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -297,7 +301,9 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) + self.to_screen( + '[download] Got server HTTP error. Retrying (attempt %d of %s)...' 
+ % (count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b1b17c6e..8bbab9dbc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -4,6 +4,7 @@ import os import re from .fragment import FragmentFD +from ..compat import compat_urllib_error from ..utils import ( sanitize_open, encodeFilename, @@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] - def append_url_to_file(target_url, target_filename): - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) - if not success: + fragment_retries = self.params.get('fragment_retries', 0) + + def append_url_to_file(target_url, tmp_filename, segment_name): + target_filename = '%s-%s' % (tmp_filename, segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + segments_filenames.append(target_sanitized) + break + except (compat_urllib_error.HTTPError, ) as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attempts + # is usually enough) thus allowing to download the whole file successfully. + # So, we will retry all fragments that fail with 404 HTTP error for now. 
+ if err.code != 404: + raise + # Retry fragment + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(segment_name, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) return False - down, target_sanitized = sanitize_open(target_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - segments_filenames.append(target_sanitized) if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init') + append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') for i, segment_url in enumerate(segment_urls): - segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i) - append_url_to_file(segment_url, segment_filename) + append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index fc9642905..664d87543 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -223,6 +223,12 @@ def write_metadata_tag(stream, metadata): write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and + 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop @@ -244,9 +250,7 @@ class F4mFD(FragmentFD): # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute if 'id' not in e.attrib: self.report_error('Missing ID in f4m DRM') - media = list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and - 'drmAdditionalHeaderSetId' not in e.attrib, - media)) + media = remove_encrypted_media(media) if not media: self.report_error('Unsupported DRM') return media diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index a5bae9669..ba903ae10 100644 --- a/youtube_dl/downloader/fragment.py +++ 
b/youtube_dl/downloader/fragment.py @@ -19,8 +19,17 @@ class HttpQuietDownloader(HttpFD): class FragmentFD(FileDownloader): """ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) """ + def report_retry_fragment(self, fragment_name, count, retries): + self.to_screen( + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' + % (fragment_name, count, self.format_retries(retries))) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 529051a93..7b0f2b21a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, @@ -108,6 +109,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chaturbate import ChaturbateIE @@ -125,6 +127,7 @@ from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE from .cmt import CMTIE +from .cnbc import CNBCIE from .cnet import CNETIE from .cnn import ( CNNIE, @@ -408,6 +411,7 @@ from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE +from .mnet import MnetIE from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE @@ -438,6 +442,7 @@ from .nationalgeographic import NationalGeographicIE from .naver import NaverIE from .nba import NBAIE from .nbc import ( + CSNNEIE, NBCIE, NBCNewsIE, NBCSportsIE, @@ -533,6 
+538,7 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, @@ -628,6 +634,7 @@ from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, + SafariApiIE, SafariCourseIE, ) from .sapo import SapoIE @@ -739,6 +746,7 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) +from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE @@ -955,7 +963,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeLiveIE, YoutubePlaylistIE, + YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, @@ -965,7 +975,6 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6a29e587f..b584277be 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index 122dc9099..c04949c21 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,6 +44,7 @@ class Abc7NewsIE(InfoExtractor): 'contentURL', webpage, 'm3u8 url', fatal=True) formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + self._sort_formats(formats) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() diff --git 
a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index e3e6d2113..55a9322a7 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -6,7 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' + _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', @@ -60,7 +60,7 @@ class AddAnimeIE(InfoExtractor): confirm_url = ( parsed_url.scheme + '://' + parsed_url.netloc + action + '?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index e0518cf26..d548592fe 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' + _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index cddcaa489..b081695d8 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' + _VALID_URL = 
r'https?://www\.aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 69e6baff7..138fa0808 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -69,12 +69,14 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) + timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + return { 'id': video_id, 'title': get_media_node('title'), 'description': get_media_node('description'), 'thumbnails': thumbnails, - 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'timestamp': timestamp, 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), 'subtitles': subtitles, 'formats': formats, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 0158407f6..9b01e38f5 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( determine_ext, - encode_dict, + extract_attributes, ExtractorError, sanitized_Request, urlencode_postdata, @@ -34,6 +37,10 @@ class AnimeOnDemandIE(InfoExtractor): # Episodes without titles 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, }] def _login(self): @@ -44,6 +51,10 @@ class AnimeOnDemandIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is 
only available in German-speaking countries of Europe' % self.IE_NAME) + login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ @@ -59,7 +70,7 @@ class AnimeOnDemandIE(InfoExtractor): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( - post_url, urlencode_postdata(encode_dict(login_form))) + post_url, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( @@ -126,33 +137,86 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] - playlist_url = self._search_regex( - r'data-playlist=(["\'])(?P.+?)\1', - episode_html, 'data playlist', default=None, group='url') - if playlist_url: - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) + for input_ in re.findall( + r']+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + attributes = extract_attributes(input_) + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue - playlist = self._download_json( - request, video_id, 'Downloading playlist JSON', fatal=False) - if playlist: - playlist = playlist['playlist'][0] - title = playlist['title'] + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list: + format_id_list.append(compat_str(num)) + format_id = 
'-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + playlist = self._download_json( + request, video_id, 'Downloading %s playlist JSON' % format_id, + fatal=False) + if not playlist: + continue + start_video = playlist.get('startvideo', 0) + playlist = playlist.get('playlist') + if not playlist or not isinstance(playlist, list): + continue + playlist = playlist[start_video] + title = playlist.get('title') + if not title: + continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') - if file_ and determine_ext(file_) == 'm3u8': - formats = self._extract_m3u8_formats( + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + continue + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) if formats: + self._sort_formats(formats) f = common_info.copy() f.update({ 'title': title, @@ -161,16 +225,18 @@ class AnimeOnDemandIE(InfoExtractor): }) entries.append(f) - m = re.search( - r'data-dialog-header=(["\'])(?P.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) - 
if m: - f = common_info.copy() - f.update({ - 'id': '%s-teaser' % f['id'], - 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), - }) - entries.append(f) + # Extract teaser only when full episode is not available + if not formats: + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), + 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b761b2cc4..95a99c6b0 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -25,7 +25,7 @@ class AolIE(InfoExtractor): class AolFeaturesIE(InfoExtractor): IE_NAME = 'features.aol.com' - _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 3e119e21b..ae0f27dcb 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' + _VALID_URL = 
r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index b8f9ae005..d2f388964 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -6,16 +6,14 @@ import hashlib import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( - int_or_none, - float_or_none, - sanitized_Request, - xpath_text, ExtractorError, + float_or_none, + int_or_none, + sanitized_Request, + urlencode_postdata, + xpath_text, ) @@ -86,7 +84,7 @@ class AtresPlayerIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 011edf128..efa624de1 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -98,7 +98,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', @@ -120,6 +120,7 @@ class AzubuLiveIE(InfoExtractor): bc_info = self._download_json(req, user) m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') + self._sort_formats(formats) return { 'id': info['id'], diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 76b21e596..234a661d3 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -9,7 +9,7 @@ 
from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' - _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index da986e063..0eb1930c2 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -4,15 +4,13 @@ import re import itertools from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_str, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -58,7 +56,7 @@ class BambuserIE(InfoExtractor): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e62b3860e..dedf721bd 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -688,6 +688,10 @@ class BBCIE(BBCCoUkIE): # custom redirection to www.bbc.com 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'only_matching': True, + }, { + # single video article embedded with data-media-vpid + 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', + 'only_matching': True, }] @classmethod @@ -817,7 +821,7 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. 
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) @@ -942,7 +946,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 1bdc25812..9bca853b3 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 03dad4636..986245bf0 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -94,6 +94,7 @@ class BetIE(InfoExtractor): xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') formats = self._extract_smil_formats(smil_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 59beb11bc..8baff2041 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -14,7 +14,7 @@ from ..utils import ( class 
BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 000000000..133228133 --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 
'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + file_url = self._search_regex( + r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1', + webpage, 'file url', group='url') + + base_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage, + 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', + group='url') + + formats = self._extract_m3u8_formats( + '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + f = { + 'url': '%s%s' % (base_url, file_url), + 'format_id': 'http', + 'protocol': 'http', + 'preference': 1, + } + if formats: + f_copy = formats[-1].copy() + f_copy.update(f) + f = f_copy + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 122a1cbb6..86a7f4d7d 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -33,7 +33,7 @@ class BokeCCBaseIE(InfoExtractor): class BokeCCIE(BokeCCBaseIE): _IE_DESC = 'CC视频' - _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' _TESTS = [{ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', diff --git a/youtube_dl/extractor/bpb.py 
b/youtube_dl/extractor/bpb.py index c28e72927..6ad45a1e6 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index aa08051b1..725859b4d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -11,7 +11,7 @@ from ..utils import ( class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 59e8008f9..c9e43a275 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -136,13 +136,16 @@ class BrightcoveLegacyIE(InfoExtractor): else: flashvars = {} + data_url = object_doc.attrib.get('data', '') + data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query) + def find_param(name): if name in flashvars: return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] - return None + return data_url_params.get(name) params = {} @@ -294,7 +297,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'uploader': video_info.get('publisherName'), } - renditions = video_info.get('renditions') + renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) if renditions: formats = [] for rend in renditions: @@ -316,13 +319,23 @@ class 
BrightcoveLegacyIE(InfoExtractor): if ext is None: ext = determine_ext(url) size = rend.get('size') - formats.append({ + a_format = { 'url': url, 'ext': ext, 'height': rend.get('frameHeight'), 'width': rend.get('frameWidth'), 'filesize': size if size != 0 else None, - }) + } + + # m3u8 manifests with remote == false are media playlists + # Not calling _extract_m3u8_formats here to save network traffic + if ext == 'm3u8': + a_format.update({ + 'ext': 'mp4', + 'protocol': 'm3u8', + }) + + formats.append(a_format) self._sort_formats(formats) info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 897f3a104..6ffbeabd3 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)' + _VALID_URL = r'https?://www.camdemy.com/folder/(?P<id>\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', @@ -139,7 +139,7 @@ class CamdemyFolderIE(InfoExtractor): parsed_url = list(compat_urlparse.urlparse(url)) query = dict(compat_urlparse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) - parsed_url[4] = compat_urllib_parse.urlencode(query) + parsed_url[4] = compat_urllib_parse_urlencode(query) final_url = compat_urlparse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) diff --git 
a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 8ddcc5097..e6b7f3584 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -11,7 +11,7 @@ from ..utils import ( class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' _TESTS = [ { @@ -96,7 +96,7 @@ class CBSNewsIE(ThePlatformIE): class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', @@ -122,6 +122,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): for entry in f4m_formats: # URLs without the extra param induce an 404 error entry.update({'extra_param_to_segment_url': hdcore_sign}) + self._sort_formats(f4m_formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index ae47e74cc..549ae32f3 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class CBSSportsIE(InfoExtractor): - _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..498d2c0d8 --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + 
ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title') + + formats = [] + + info_dict = { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': None, + } + + def extract_format(page, version): + unpacked = decode_packed_codes(page) + format_url = self._search_regex( + r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False) + if not format_url: + return + f = { + 'url': format_url, + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(self._search_regex( + r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False)) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + 
r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + webpage = self._download_webpage( + href, video_id, 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b27b4e670..6652c8e42 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) @@ -13,6 +12,7 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, + urlencode_postdata, ) @@ -102,7 +102,7 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request( 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=compat_urllib_parse.urlencode(data)) + data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') @@ -129,7 +129,8 @@ class CeskaTelevizeIE(InfoExtractor): formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 242fba311..b2234549e 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -48,6 +48,7 @@ class ChaturbateIE(InfoExtractor): raise 
ExtractorError('Unable to find stream URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2996b6b09..19f8b397e 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -19,7 +19,7 @@ def _decode(s): class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 8306d6fb7..0b6ad895f 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -8,7 +8,7 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 0fa720ee8..9e267e6c0 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -64,7 +64,7 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form)) + data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( data_url, video_id, 'Downloading player data') data = compat_parse_qs(player_data) diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 
1dfa7c12e..2fba93543 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,7 +12,7 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py new file mode 100644 index 000000000..25b308752 --- /dev/null +++ b/youtube_dl/extractor/cnbc.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class CNBCIE(InfoExtractor): + _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P[0-9]+)' + _TEST = { + 'url': 'http://video.cnbc.com/gallery/?video=3000503714', + 'info_dict': { + 'id': '3000503714', + 'ext': 'mp4', + 'title': 'Fighting zombies is big business', + 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, + {'force_smil_url': True}), + 'id': video_id, + } diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 7dff68492..747c245c8 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -11,7 +11,7 @@ from ..utils import ( class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' + _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P[a-z0-9\-]*)' _TESTS = [{ 'url': 
'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { @@ -41,7 +41,13 @@ class ComCarCoffIE(InfoExtractor): display_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] + video_id = compat_str(video_data['mediaId']) + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['mediaUrl'], video_id, 'mp4') + self._sort_formats(formats) + thumbnails = [{ 'url': video_data['images']['thumb'], }, { @@ -54,15 +60,14 @@ class ComCarCoffIE(InfoExtractor): video_data.get('duration')) return { - '_type': 'url_transparent', - 'url': 'crackle:%s' % video_id, 'id': video_id, 'display_id': display_id, - 'title': video_data['title'], + 'title': title, 'description': video_data.get('description'), 'timestamp': timestamp, 'duration': duration, 'thumbnails': thumbnails, + 'formats': formats, 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 5b1b99675..0c59102e0 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -5,7 +5,7 @@ import re from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): # Correct cc.com in uri uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) + index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) idoc = self._download_xml( index_url, epTitle, 
'Downloading show index', 'Unable to download episode index') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 770105a5b..9b7ab8924 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,9 +21,10 @@ from ..compat import ( compat_os_name, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) +from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -989,6 +990,11 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + # Remove unsupported DRM protected media from final formats + # rendition (see https://github.com/rg3/youtube-dl/issues/8573). + media_nodes = remove_encrypted_media(media_nodes) + if not media_nodes: + return formats base_url = xpath_text( manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 'base URL', default=None) @@ -1021,8 +1027,6 @@ class InfoExtractor(object): 'height': int_or_none(media_el.attrib.get('height')), 'preference': preference, }) - self._sort_formats(formats) - return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, @@ -1143,7 +1147,6 @@ class InfoExtractor(object): last_media = None formats.append(f) last_info = {} - self._sort_formats(formats) return formats @staticmethod @@ -1300,7 +1303,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' 
- f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) continue @@ -1317,8 +1320,6 @@ class InfoExtractor(object): }) continue - self._sort_formats(formats) - return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -1536,7 +1537,6 @@ class InfoExtractor(object): existing_format.update(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - self._sort_formats(formats) return formats def _live_title(self, name): diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 6f92ae2ed..e8f2b5a07 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -45,7 +45,7 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' 
% '|'.join(_SITES.keys()) @@ -97,7 +97,7 @@ class CondeNastIE(InfoExtractor): video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse.urlencode({'videoId': video_id, + data = compat_urllib_parse_urlencode({'videoId': video_id, 'playerId': player_id, 'target': target, }) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 85fa7a725..8ae3f2890 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,8 +11,8 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -78,7 +78,7 @@ class CrunchyrollBaseIE(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/7202. 
qs['skip_wall'] = ['1'] return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) class CrunchyrollIE(CrunchyrollBaseIE): @@ -308,7 +308,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) + playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') @@ -322,7 +322,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' % (stream_id, stream_format, stream_quality), - compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) + compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b8b9d058d..84b36f44c 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', diff --git a/youtube_dl/extractor/ctsnews.py 
b/youtube_dl/extractor/ctsnews.py index 45049bf37..1622fc844 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) - _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' + _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 36af67013..f5cefd966 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -57,6 +57,7 @@ class CWTVIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + self._sort_formats(formats) thumbnails = [{ 'url': image['uri'], diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index c84c51058..86024a745 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -8,8 +8,8 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -70,7 +70,7 @@ class DaumIE(InfoExtractor): def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse.urlencode({'vid': video_id}) + query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' 
+ query, video_id, 'Downloading video formats info') @@ -86,7 +86,7 @@ class DaumIE(InfoExtractor): formats = [] for format_el in movie_data['output_list']['output_list']: profile = format_el['profile'] - format_query = compat_urllib_parse.urlencode({ + format_query = compat_urllib_parse_urlencode({ 'vid': video_id, 'profile': profile, }) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 15a1c40f7..5deff5f30 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -6,7 +6,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( @@ -15,6 +15,7 @@ from ..utils import ( sanitized_Request, smuggle_url, unsmuggle_url, + urlencode_postdata, ) @@ -106,7 +107,7 @@ class DCNVideoIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': video_data['id'], 'user_id': video_data['user_id'], 'signature': video_data['signature'], @@ -133,7 +134,7 @@ class DCNLiveIE(DCNBaseIE): webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), 'signature': channel_data['signature'], @@ -174,7 +175,7 @@ class DCNSeasonIE(InfoExtractor): data['show_id'] = show_id request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', - compat_urllib_parse.urlencode(data), + urlencode_postdata(data), { 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index aa2c09eb6..9099f5046 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 98e3aedfd..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' + _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P[^/?#]*)' _TEST = { 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 263532cc6..cdfeccacb 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -38,6 +38,7 @@ class DFBIE(InfoExtractor): token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] 
+ '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' formats = self._extract_f4m_formats(manifest_url, display_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index ce680a9f3..5f1275b39 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,7 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| @@ -63,18 +63,23 @@ class DiscoveryIE(InfoExtractor): video_title = info.get('playlist_title') or info.get('video_title') - entries = [{ - 'id': compat_str(video_info['id']), - 'formats': self._extract_m3u8_formats( + entries = [] + + for idx, video_info in enumerate(info['playlist']): + formats = self._extract_m3u8_formats( video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', - note='Download m3u8 information for video %d' % (idx + 1)), - 'title': video_info['title'], - 'description': video_info.get('description'), - 'duration': parse_duration(video_info.get('video_length')), - 'webpage_url': video_info.get('href') or video_info.get('url'), - 'thumbnail': video_info.get('thumbnailURL'), - 'alt_title': video_info.get('secondary_title'), - 'timestamp': parse_iso8601(video_info.get('publishedDate')), - } for idx, video_info in enumerate(info['playlist'])] + note='Download m3u8 information for video %d' % (idx + 1)) + self._sort_formats(formats) + entries.append({ + 'id': compat_str(video_info['id']), + 'formats': formats, + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href') or video_info.get('url'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': 
parse_iso8601(video_info.get('publishedDate')), + }) return self.playlist_result(entries, display_id, video_title) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index bdc768c78..3915cb182 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -60,6 +60,9 @@ class DouyuTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.douyu.com/xiaocang', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a638c827c..66bbfc6ca 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -10,7 +10,7 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', @@ -118,6 +118,8 @@ class DPlayIE(InfoExtractor): if info.get(protocol): extract_formats(protocol, info[protocol]) + self._sort_formats(formats) + return { 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index d35e88881..3b6529f4b 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -6,7 +6,6 @@ import itertools from .amp import AMPIE from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -14,6 +13,7 @@ from ..utils import ( clean_html, int_or_none, sanitized_Request, + 
urlencode_postdata ) @@ -50,7 +50,7 @@ class DramaFeverBaseIE(AMPIE): } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) response = self._download_webpage( request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 028144f20..0040e70d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -7,7 +7,7 @@ from .zdf import ZDFIE class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' + _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)$' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index c1a4bc757..974c69dbc 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor): IE_NAME = 'dvtv' IE_DESC = 'http://video.aktualne.cz/' - _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' + _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P[0-9a-f]{32})' _TESTS = [{ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/', diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py index b6c985547..ae7c571bd 100644 --- a/youtube_dl/extractor/dw.py +++ b/youtube_dl/extractor/dw.py @@ -39,13 +39,13 @@ class DWIE(InfoExtractor): hidden_inputs = self._hidden_inputs(webpage) title = hidden_inputs['media_title'] - formats = [] if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': formats = self._extract_smil_formats( 'http://www.dw.com/smil/v-%s' % media_id, 
media_id, transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) + self._sort_formats(formats) else: formats = [{'url': hidden_inputs['file_name']}] diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py index d2d94049d..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/youtube_dl/extractor/echomsk.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EchoMskIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' _TEST = { 'url': 'http://www.echo.msk.ru/sounds/1464134.html', 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 7fcd0151d..297f8a6f5 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, unescapeHTML @@ -43,7 +43,7 @@ class EroProfileIE(InfoExtractor): if username is None: return - query = compat_urllib_parse.urlencode({ + query = compat_urllib_parse_urlencode({ 'username': username, 'password': password, 'url': 'http://www.eroprofile.com/', diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 0c0fe6d65..09ed4f2b5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class ExfmIE(InfoExtractor): IE_NAME = 'exfm' IE_DESC = 'ex.fm' - _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P[^/]+)' _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 
9580f5c0c..c7d69ff1f 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -5,19 +5,18 @@ import hashlib from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, sanitized_Request, + urlencode_postdata, ) class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -57,7 +56,7 @@ class FC2IE(InfoExtractor): 'Submit': ' Login ', } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') + login_data = urlencode_postdata(login_form_strs) request = sanitized_Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 298227d57..e8936cb24 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P[0-9]+)\.html' _TEST = { 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 510d4b108..98b165143 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' _TESTS = [{ 'url': 'http://www.1tv.ru/videoarchive/73390', diff --git a/youtube_dl/extractor/fivemin.py 
b/youtube_dl/extractor/fivemin.py index 67d50a386..6b8345416 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -4,8 +4,8 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_parse_qs, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -109,7 +109,7 @@ class FiveMinIE(InfoExtractor): response = self._download_json( 'https://syn.5min.com/handlers/SenseHandler.ashx?' + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'func': 'GetResults', 'playlist': video_id, 'sid': sid, diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 5f6e65dae..a3a291599 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -10,7 +10,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' + _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P[0-9]+)(?:/.*)?' 
_TEST = { 'url': 'http://fernsehkritik.tv/folge-1', diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 18f439df9..0a3de1498 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, @@ -42,7 +42,7 @@ class FlickrIE(InfoExtractor): } if secret: query['secret'] = secret - data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) if data['stat'] != 'ok': raise ExtractorError(data['message']) return data diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 370fd006f..d2503ae2e 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FootyRoomIE(InfoExtractor): - _VALID_URL = r'http://footyroom\.com/(?P[^/]+)' + _VALID_URL = r'https?://footyroom\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', 'info_dict': { diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 08b8ea362..70c1a815d 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FoxgayIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' + _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', 'md5': '80d72beab5d04e1655a56ad37afe6841', diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 1dc50318c..b04da2415 100644 --- 
a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -18,8 +18,8 @@ class FoxNewsIE(AMPIE): 'title': 'Frozen in Time', 'description': '16-year-old girl is size of toddler', 'duration': 265, - # 'timestamp': 1304411491, - # 'upload_date': '20110503', + 'timestamp': 1304411491, + 'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -32,8 +32,8 @@ class FoxNewsIE(AMPIE): 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal", 'description': "Congressman discusses president's plan", 'duration': 292, - # 'timestamp': 1417662047, - # 'upload_date': '20141204', + 'timestamp': 1417662047, + 'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 0388ba00c..2369f868d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' _TEST = { 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', 'md5': '4764932e466e6f6c79c317d2e74f6884', diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3f4ac3093..ad94e31f3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,28 +60,31 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) + 
formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', - 'preference': 1, }) else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'preference': -1, - }) + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) self._sort_formats(formats) title = info['titre'] subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle + title = title.strip() subtitles = {} subtitles_list = [{ @@ -125,13 +128,13 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', 'info_dict': { 'id': '84981923', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, @@ -139,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'fr': 'mincount:2', }, }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'info_dict': { @@ -155,11 +162,32 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { - 'id': '556e03339473995ee145930c', + 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', 
'thumbnail': 're:^https?://.*\.jpe?g$', - } + 'timestamp': 1433273139, + 'upload_date': '20150602', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': 'NI_657393', + 'ext': 'mp4', + 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', + 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1458300695, + 'upload_date': '20160318', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -172,7 +200,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self.url_result(dmcloud_url, 'DailymotionCloud') video_id, catalogue = self._search_regex( - r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + (r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py index c7bec027b..cd8423a6f 100644 --- a/youtube_dl/extractor/freevideo.py +++ b/youtube_dl/extractor/freevideo.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' + _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 0f37ed786..1eb528f31 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( clean_html, determine_ext, - encode_dict, 
int_or_none, sanitized_Request, ExtractorError, @@ -54,10 +53,10 @@ class FunimationIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'email_field': username, 'password_field': password, - })) + }) login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', 'Content-Type': 'application/x-www-form-urlencoded' diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index f6b9046f9..cbcddcb7c 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -10,7 +10,7 @@ from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index b3f1bafcc..4ffdd7515 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' 
_TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 590ccf526..69058a583 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -13,7 +13,7 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c3f031d9c..1e7948ab8 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -9,7 +9,7 @@ from ..utils import ( class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' _TEST = { 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3befd3e7b..59ed4c38f 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( remove_end, HEADRequest, sanitized_Request, + urlencode_postdata, ) @@ -123,7 +123,7 @@ class GDCVaultIE(InfoExtractor): 'password': password, } - request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request(login_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') 
self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26de27a7e..f3de738f7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -1123,7 +1124,23 @@ class GenericIE(InfoExtractor): # m3u8 downloads 'skip_download': True, } - } + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', + 'uploader': 'GlobeCast Australia - GlobeStream', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, ] def report_following_redirect(self, new_url): @@ -1293,6 +1310,7 @@ class GenericIE(InfoExtractor): 'vcodec': 'none' if m.group('type') == 'audio' else None }] info_dict['direct'] = True + self._sort_formats(formats) info_dict['formats'] = formats return info_dict @@ -1319,6 +1337,7 @@ class GenericIE(InfoExtractor): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + self._sort_formats(info_dict['formats']) return info_dict # Maybe it's a direct link to a video? 
@@ -1343,15 +1362,19 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - return self._parse_smil(doc, url, video_id) + smil = self._parse_smil(doc, url, video_id) + self._sort_formats(smil['formats']) + return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, mpd_base_url=url.rpartition('/')[0]) + self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: pass @@ -1909,6 +1932,19 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2023,6 +2059,9 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url + if entry_info_dict.get('formats'): + self._sort_formats(entry_info_dict['formats']) + entries.append(entry_info_dict) if len(entries) == 1: diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 31e219945..9db565209 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -3,16 
+3,16 @@ from __future__ import unicode_literals import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, HEADRequest, sanitized_Request, + urlencode_postdata, ) class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', @@ -35,7 +35,7 @@ class HotNewHipHopIE(InfoExtractor): r'"contentUrl" content="(.*?)"', webpage, 'content URL') return self.url_result(video_url, ie='Youtube') - reqdata = compat_urllib_parse.urlencode([ + reqdata = urlencode_postdata([ ('mediaType', 's'), ('mediaId', video_id), ]) diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index b3706fe6d..f7c913054 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -4,7 +4,7 @@ import json import time from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, sanitized_Request, @@ -12,7 +12,7 @@ from ..utils import ( class HypemIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' _TEST = { 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', @@ -28,7 +28,7 @@ class HypemIE(InfoExtractor): track_id = self._match_id(url) data = {'ax': 1, 'ts': time.time()} - request = sanitized_Request(url + '?' + compat_urllib_parse.urlencode(data)) + request = sanitized_Request(url + '?' 
+ compat_urllib_parse_urlencode(data)) response, urlh = self._download_webpage_handle( request, track_id, 'Downloading webpage with the url') diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index b61b2dc4e..8bed8ccd0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,7 +12,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -70,7 +70,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ed3e07118..4e62098b0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,8 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, limit_length, + lowercase_escape, ) @@ -38,6 +40,18 @@ class InstagramIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_embed_url(webpage): + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + def _real_extract(self, url): video_id = self._match_id(url) @@ -46,6 +60,8 @@ class InstagramIE(InfoExtractor): webpage, 'uploader id', fatal=False) desc = self._search_regex( r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not 
None: + desc = lowercase_escape(desc) return { 'id': video_id, diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 483cc6f9e..e60145b3d 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( xpath_with_ns, @@ -38,7 +38,7 @@ class InternetVideoArchiveIE(InfoExtractor): # Other player ids return m3u8 urls cleaned_dic['playerid'] = '247' cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse.urlencode(cleaned_dic) + return compat_urllib_parse_urlencode(cleaned_dic) def _real_extract(self, url): query = compat_urlparse.urlparse(url).query diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 61a0de472..788bbe0d5 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,6 +6,8 @@ import time from .common import InfoExtractor from ..utils import ( + determine_ext, + js_to_json, sanitized_Request, ) @@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor): req.add_header('Referer', url) playerpage = self._download_webpage(req, video_id, note='Downloading player') - m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url') + formats = [] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + 
new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)var\s+playerOptions\s*=\s*({.+?});', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) self._sort_formats(formats) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index e7c0cb3f6..9e8c9432a 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -14,7 +14,7 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -322,7 +322,7 @@ class IqiyiIE(InfoExtractor): 'bird_t': timestamp, } validation_result = self._download_json( - 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None, + 'http://kylin.iqiyi.com/validate?' 
+ compat_urllib_parse_urlencode(validation_params), None, note='Validate credentials', errnote='Unable to validate credentials') MSG_MAP = { @@ -456,7 +456,7 @@ class IqiyiIE(InfoExtractor): 'QY00001': auth_result['data']['u'], }) api_video_url += '?' if '?' not in api_video_url else '&' - api_video_url += compat_urllib_parse.urlencode(param) + api_video_url += compat_urllib_parse_urlencode(param) js = self._download_json( api_video_url, video_id, note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) @@ -494,14 +494,14 @@ class IqiyiIE(InfoExtractor): } api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ - compat_urllib_parse.urlencode(param) + compat_urllib_parse_urlencode(param) raw_data = self._download_json(api_url, video_id) return raw_data def get_enc_key(self, video_id): # TODO: automatic key extraction # last update at 2016-01-22 for Zombie::bite - enc_key = '8ed797d224d043e7ac23d95b70227d32' + enc_key = '4a1caba4b4465345366f28da7c117d20' return enc_key def _extract_playlist(self, webpage): diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py index 617dc8c07..3ca824f79 100644 --- a/youtube_dl/extractor/ivideon.py +++ b/youtube_dl/extractor/ivideon.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import qualities @@ -62,7 +62,7 @@ class IvideonIE(InfoExtractor): quality = qualities(self._QUALITIES) formats = [{ - 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({ + 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ 'server': server_id, 'camera': camera_id, 'sessionId': 'demo', diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py index 063e86de4..158c09a33 100644 --- a/youtube_dl/extractor/jadorecettepub.py +++ b/youtube_dl/extractor/jadorecettepub.py @@ -9,7 +9,7 @@ 
from .youtube import YoutubeIE class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' + _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' _TEST = { 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 137db873c..1a4227f6b 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 44d7c84a1..a65697ff5 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, compat_parse_qs, ) @@ -71,7 +71,7 @@ class KalturaIE(InfoExtractor): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse.urlencode(params) + query = compat_urllib_parse_urlencode(params) url = self._API_BASE + query data = self._download_json(url, video_id, *args, **kwargs) diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 06daf5a89..b4c30b7f3 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -9,7 +9,7 @@ from ..utils import ( class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' + _VALID_URL = 
r'https?://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' _TEST = { 'url': 'http://karaoketv.co.il/?container=songs&id=171568', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index bed94bc93..2cb04e533 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -12,7 +12,7 @@ from ..utils import ( class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index a59c529f4..704bd7b34 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -13,7 +13,7 @@ from ..utils import ( class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py index a602980a1..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/youtube_dl/extractor/ku6.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class Ku6IE(InfoExtractor): - _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' + _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' _TEST = { 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', 'md5': '01203549b9efbb45f4b87d55bdea1ed1', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 931f34c9b..12cc56e44 100644 --- 
a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -16,7 +16,7 @@ from ..utils import ( class KUSIIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' _TESTS = [{ 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index f94804d06..86c17c931 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, + InAdvancePagedList, remove_start, ) @@ -26,10 +26,23 @@ class KuwoBaseIE(InfoExtractor): def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: + headers = {} + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + + query = { + 'format': file_format['ext'], + 'br': file_format.get('br', ''), + 'rid': 'MUSIC_%s' % song_id, + 'type': 'convert_url', + 'response': 'url' + } + song_url = self._download_webpage( - 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % - (file_format['ext'], file_format.get('br', ''), song_id), + 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], + query=query, headers=headers, ) if song_url == 'IPDeny' and not tolerate_ip_deny: @@ -44,18 +57,13 @@ class KuwoBaseIE(InfoExtractor): 'abr': file_format.get('abr'), }) - # XXX _sort_formats fails if there are not formats, while it's not the - # desired behavior if 'IPDeny' is ignored - # This check can be removed if 
https://github.com/rg3/youtube-dl/pull/8051 is merged - if not tolerate_ip_deny: - self._sort_formats(formats) return formats class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -80,6 +88,9 @@ class KuwoIE(KuwoBaseIE): 'params': { 'format': 'mp3-320' }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,6 +111,7 @@ class KuwoIE(KuwoBaseIE): lrc_content = None formats = self._get_formats(song_id) + self._sort_formats(formats) album_id = self._html_search_regex( r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', @@ -131,7 +143,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -167,13 +179,11 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', - 'title': '香港中文龙虎榜', - 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -184,30 +194,24 @@ class KuwoChartIE(InfoExtractor): url, chart_id, note='Download chart info', errnote='Unable to get chart info') - chart_name = self._html_search_regex( - r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name') - - chart_desc = self._html_search_regex( - r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc') - entries = [ 
self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) + r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) ] - return self.playlist_result(entries, chart_id, chart_name, chart_desc) + return self.playlist_result(entries, chart_id) class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, - 'playlist_count': 10, + 'playlist_mincount': 329, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { @@ -218,6 +222,8 @@ class KuwoSingerIE(InfoExtractor): 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] + PAGE_SIZE = 15 + def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( @@ -225,25 +231,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name' - ) + r'<h1>([^<]+)</h1>', webpage, 'singer name') - entries = [] - first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True - for page_num in itertools.count(1): + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): webpage = self._download_webpage( - 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), - singer_id, note='Download song list page #%d' % page_num, - errnote='Unable to get song list page #%d' % page_num) + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page #%d' % (page_num + 1), + errnote='Unable 
to get song list page #%d' % (page_num + 1), + query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - entries.extend([ + return [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', + r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ][:10 if first_page_only else None]) + ] - if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage): - break + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) return self.playlist_result(entries, singer_id, singer_name) @@ -251,7 +260,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -288,7 +297,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 5d8ebbeb3..d4fbafece 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -19,7 +19,7 @@ from ..utils import ( class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)' _TESTS = [{ 'url': 
'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', 'info_dict': { @@ -47,12 +47,28 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + }, { + 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'info_dict': { + 'id': '487850', + 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'ext': 'flv', + 'title': 'Belogorie BELGOROD - TRENTINO Diatec', + 'upload_date': '20160322', + 'uploader': 'CEV - Europäischer Volleyball Verband', + 'is_live': True, + 'categories': ['Volleyball'], + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('slug') + kind = mobj.group('kind') lang = mobj.group('lang') portal = mobj.group('portal') @@ -74,7 +90,7 @@ class Laola1TvIE(InfoExtractor): hd_doc = self._download_xml( 'http://www.laola1.tv/server/hd_video.php?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'play': video_id, 'partner': partner_id, 'portal': portal, @@ -85,12 +101,17 @@ class Laola1TvIE(InfoExtractor): _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) title = _v('title', fatal=True) + VS_TARGETS = { + 'video': '2', + 'livestream': '17', + } + req = sanitized_Request( 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'videoId': video_id, - 'target': '2', - 'label': 'laola1tv', + 'target': VS_TARGETS.get(kind, '2'), + 'label': _v('label'), 'area': _v('area'), }), urlencode_postdata( @@ -109,6 +130,7 @@ class Laola1TvIE(InfoExtractor): formats = self._extract_f4m_formats( '%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], 
token_auth), video_id, f4m_id='hds') + self._sort_formats(formats) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index df47e88ba..375fdaed1 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -11,7 +11,7 @@ from .common import InfoExtractor from ..compat import ( compat_ord, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, @@ -28,7 +28,7 @@ from ..utils import ( class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -122,7 +122,7 @@ class LeIE(InfoExtractor): 'domain': 'www.le.com' } play_json_req = sanitized_Request( - 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) + 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) ) cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') if cn_verification_proxy: @@ -151,7 +151,7 @@ class LeIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse.urlencode({ + media_url += '&' + compat_urllib_parse_urlencode({ 'm3v': 1, 'format': 1, 'expect': 3, @@ -196,7 +196,7 @@ class LeIE(InfoExtractor): class LePlaylistIE(InfoExtractor): - _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://www.le.com/tv/46177.html', @@ -305,7 +305,7 @@ class LetvCloudIE(InfoExtractor): } self.sign_data(data) return self._download_json( - 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), + 'http://api.letvcloud.com/gpc.php?' 
+ compat_urllib_parse_urlencode(data), media_id, 'Downloading playJson data for type %s' % cf) play_json = get_play_json(cf, time.time()) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index a8fd639cc..ba2f80a75 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -17,7 +17,7 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' + _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _TESTS = [{ # single video embedded via video/source @@ -159,7 +159,7 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' _TEST = { 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 1a0625ac3..2599d45c3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -123,7 +123,7 @@ class LimelightBaseIE(InfoExtractor): class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' - _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:media:|https?://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' _TESTS = [{ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { @@ -176,7 +176,7 @@ class LimelightMediaIE(LimelightBaseIE): class LimelightChannelIE(LimelightBaseIE): IE_NAME = 'limelight:channel' - _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = 
r'(?:limelight:channel:|https?://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', 'info_dict': { @@ -207,7 +207,7 @@ class LimelightChannelIE(LimelightBaseIE): class LimelightChannelListIE(LimelightBaseIE): IE_NAME = 'limelight:channel_list' - _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel_list:|https?://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', 'info_dict': { diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 863efd896..1072405b3 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -37,6 +37,7 @@ class LRTIE(InfoExtractor): r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)', webpage, 'm3u8 url', group='url') formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d4e1ae99d..655627479 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -4,15 +4,13 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, clean_html, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -36,7 +34,7 @@ class LyndaBaseIE(InfoExtractor): 'stayPut': 'false' } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) 
login_page = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -65,7 +63,7 @@ class LyndaBaseIE(InfoExtractor): 'stayPut': 'false', } request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(confirm_form)) login_page = self._download_webpage( request, None, 'Confirming log in and log out from another device') @@ -221,7 +219,7 @@ class LyndaCourseIE(LyndaBaseIE): 'Course %s does not exist' % course_id, expected=True) unaccessible_videos = 0 - videos = [] + entries = [] # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore @@ -231,20 +229,22 @@ class LyndaCourseIE(LyndaBaseIE): if video.get('HasAccess') is False: unaccessible_videos += 1 continue - if video.get('ID'): - videos.append(video['ID']) + video_id = video.get('ID') + if video_id: + entries.append({ + '_type': 'url_transparent', + 'url': 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), + 'ie_key': LyndaIE.ie_key(), + 'chapter': chapter.get('Title'), + 'chapter_number': int_or_none(chapter.get('ChapterIndex')), + 'chapter_id': compat_str(chapter.get('ID')), + }) if unaccessible_videos > 0: self._downloader.report_warning( '%s videos are only available for members (or paid members) and will not be downloaded. 
' % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT) - entries = [ - self.url_result( - 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id), - 'Lynda') - for video_id in videos] - course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index 7e025831b..d5945ad66 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class M6IE(InfoExtractor): IE_NAME = 'm6' - _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' + _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' _TEST = { 'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html', diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 71085f279..9a7098c43 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,7 +13,7 @@ from ..utils import ( class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' _TESTS = [ { @@ -61,6 +61,10 @@ class MailRuIE(InfoExtractor): 'duration': 6001, }, 'skip': 'Not accessible from Travis CI server', + }, + { + 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py index 28e0dfe63..80a0d7013 100644 --- a/youtube_dl/extractor/matchtv.py +++ b/youtube_dl/extractor/matchtv.py @@ -4,7 +4,7 @@ from __future__ import 
unicode_literals import random from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( sanitized_Request, xpath_text, @@ -29,7 +29,7 @@ class MatchTVIE(InfoExtractor): def _real_extract(self, url): video_id = 'matchtv-live' request = sanitized_Request( - 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({ + 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse_urlencode({ 'ts': '', 'quality': 'SD', 'contentId': '561d2c0df7159b37178b4567', @@ -47,6 +47,7 @@ class MatchTVIE(InfoExtractor): video_url = self._download_json(request, video_id)['data']['videoUrl'] f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(formats) return { 'id': video_id, 'title': self._live_title('Матч ТВ - Прямой эфир'), diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 67d6271e1..61dadb7a7 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( @@ -13,11 +12,12 @@ from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) class MetacafeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' @@ -117,7 +117,7 @@ class MetacafeIE(InfoExtractor): 'filters': '0', 'submit': "Continue - I'm over 18", } - request = sanitized_Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + request = 
sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() self._download_webpage(request, None, False, 'Unable to confirm age') diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py index e46b23a6f..e6730b75a 100644 --- a/youtube_dl/extractor/minhateca.py +++ b/youtube_dl/extractor/minhateca.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( int_or_none, parse_duration, parse_filesize, sanitized_Request, + urlencode_postdata, ) @@ -39,7 +39,7 @@ class MinhatecaIE(InfoExtractor): ] req = sanitized_Request( 'http://minhateca.com.br/action/License/Download', - data=compat_urllib_parse.urlencode(token_data)) + data=urlencode_postdata(token_data)) req.add_header('Content-Type', 'application/x-www-form-urlencoded') data = self._download_json( req, video_id, note='Downloading metadata') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 819c1b90b..1aea78d11 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -91,7 +91,7 @@ class MITIE(TechTVMITIE): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index c595f2077..7b4581dc5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - encode_dict, get_element_by_attribute, int_or_none, ) @@ -14,7 +13,7 @@ from ..utils import 
( class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' - _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', @@ -60,7 +59,7 @@ class MiTeleIE(InfoExtractor): 'sta': '0', } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))), + '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), display_id, 'Downloading %s JSON' % location['loc']) file_ = media.get('file') if not file_: @@ -68,6 +67,7 @@ class MiTeleIE(InfoExtractor): formats.extend(self._extract_f4m_formats( file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', display_id, f4m_id=loc)) + self._sort_formats(formats) title = self._search_regex( r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title') diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py new file mode 100644 index 000000000..e3f42e7bd --- /dev/null +++ b/youtube_dl/extractor/mnet.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class MnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.mnet.com/tv/vod/171008', + 'info_dict': { + 'id': '171008', + 'title': 'SS_이해인@히든박스', + 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', + 'duration': 88, + 'upload_date': '20151231', + 'timestamp': 1451564040, + 'age_limit': 0, + 'thumbnails': 'mincount:5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'ext': 'flv', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://mnet.interest.me/tv/vod/172790', + 'only_matching': True, + }, { + 'url': 
'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_json( + 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, + video_id, 'Downloading vod config JSON')['data']['info'] + + title = info['title'] + + rtmp_info = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON') + + formats = [{ + 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', + }] + + description = info.get('ment') + duration = parse_duration(info.get('time')) + timestamp = parse_iso8601(info.get('date'), delimiter=' ') + age_limit = info.get('adult') + if age_limit is not None: + age_limit = 0 if age_limit == 'N' else 18 + thumbnails = [{ + 'id': thumb_format, + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py index d930b9634..978d5d5bf 100644 --- a/youtube_dl/extractor/moevideo.py +++ b/youtube_dl/extractor/moevideo.py @@ -5,11 +5,11 @@ import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -77,7 +77,7 @@ class MoeVideoIE(InfoExtractor): ], ] r_json = json.dumps(r) - post = compat_urllib_parse.urlencode({'r': r_json}) + post = urlencode_postdata({'r': r_json}) req = sanitized_Request(self._API_URL, post) 
req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index f6bf94f2f..b208820fe 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -5,11 +5,11 @@ import os.path import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, remove_start, sanitized_Request, + urlencode_postdata, ) @@ -88,7 +88,7 @@ class MonikerIE(InfoExtractor): fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) data = dict(fields) - post = compat_urllib_parse.urlencode(data) + post = urlencode_postdata(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index 7cc7f054f..a85109a89 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -3,17 +3,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' + _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' _TESTS = [ { @@ -58,7 +58,7 @@ class MooshareIE(InfoExtractor): } request = sanitized_Request( - 'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form)) + 'http://mooshare.biz/%s' % video_id, urlencode_postdata(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._sleep(5, video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 0b4787c1d..5e1a8a71a 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -12,7 +12,7 @@ from ..utils import 
( class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [{ 'url': 'http://motherless.com/AC3FFE1', 'md5': '310f62e325a9fafe64f68c0bccb6e75f', @@ -69,6 +69,9 @@ class MotherlessIE(InfoExtractor): ">The page you're looking for cannot be found.<")): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + if '>The content you are trying to view is for friends only.' in webpage: + raise ExtractorError('Video %s is for friends only' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index c1a482dba..370328b36 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed068365d..640ee3d93 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, ) from ..utils import ( @@ -17,6 +17,7 @@ from ..utils import ( unescapeHTML, url_basename, RegexNotFoundError, + xpath_text, ) @@ -130,11 +131,7 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description_node 
= itemdoc.find('description') - if description_node is not None: - description = description_node.text.strip() - else: - description = None + description = xpath_text(itemdoc, 'description') title_el = None if title_el is None: @@ -174,7 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse.urlencode(data) + return compat_urllib_parse_urlencode(data) def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py index 1e9cf8de9..cbc800481 100644 --- a/youtube_dl/extractor/muzu.py +++ b/youtube_dl/extractor/muzu.py @@ -1,9 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_urlencode class MuzuTVIE(InfoExtractor): @@ -25,7 +23,7 @@ class MuzuTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - info_data = compat_urllib_parse.urlencode({ + info_data = compat_urllib_parse_urlencode({ 'format': 'json', 'url': url, }) @@ -41,7 +39,7 @@ class MuzuTVIE(InfoExtractor): if video_info.get('v%s' % quality): break - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'ai': video_id, # Even if each time you watch a video the hash changes, # it seems to work for different videos, and it will work diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index 83414a232..0d5238d77 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..compat import ( - compat_str, +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, ) -from ..utils import ExtractorError class MySpaceIE(InfoExtractor): @@ -24,6 +24,8 @@ class MySpaceIE(InfoExtractor): 
'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', 'uploader': 'Five Minutes to the Stage', 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, 'params': { # rtmp download @@ -64,7 +66,7 @@ class MySpaceIE(InfoExtractor): 'ext': 'mp4', 'title': 'Starset - First Light', 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Jacob Soren', + 'uploader': 'Yumi K', 'uploader_id': 'SorenPromotions', 'upload_date': '20140725', } @@ -78,6 +80,19 @@ class MySpaceIE(InfoExtractor): player_url = self._search_regex( r'playerSwf":"([^"?]*)', webpage, 'player URL') + def rtmp_format_from_stream_url(stream_url, width=None, height=None): + rtmp_url, play_path = stream_url.split(';', 1) + return { + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + } + if mobj.group('mediatype').startswith('music/song'): # songs don't store any useful info in the 'context' variable song_data = self._search_regex( @@ -93,8 +108,8 @@ class MySpaceIE(InfoExtractor): return self._search_regex( r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, song_data, name, default='', group='data') - streamUrl = search_data('stream-url') - if not streamUrl: + stream_url = search_data('stream-url') + if not stream_url: vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: @@ -106,36 +121,47 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') - info = { + return { 'id': video_id, 'title': self._og_search_title(webpage), 'uploader': search_data('artist-name'), 'uploader_id': search_data('artist-username'), 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(search_data('duration')), + 'formats': [rtmp_format_from_stream_url(stream_url)] } 
else: - context = json.loads(self._search_regex( - r'context = ({.*?});', webpage, 'context')) - video = context['video'] - streamUrl = video['streamUrl'] - info = { - 'id': compat_str(video['mediaId']), + video = self._parse_json(self._search_regex( + r'context = ({.*?});', webpage, 'context'), + video_id)['video'] + formats = [] + hls_stream_url = video.get('hlsStreamUrl') + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + stream_url = video.get('streamUrl') + if stream_url: + formats.append(rtmp_format_from_stream_url( + stream_url, + int_or_none(video.get('width')), + int_or_none(video.get('height')))) + self._sort_formats(formats) + return { + 'id': video_id, 'title': video['title'], - 'description': video['description'], - 'thumbnail': video['imageUrl'], - 'uploader': video['artistName'], - 'uploader_id': video['artistUsername'], + 'description': video.get('description'), + 'thumbnail': video.get('imageUrl'), + 'uploader': video.get('artistName'), + 'uploader_id': video.get('artistUsername'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'formats': formats, } - rtmp_url, play_path = streamUrl.split(';', 1) - info.update({ - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'ext': 'flv', - }) - return info - class MySpaceAlbumIE(InfoExtractor): IE_NAME = 'MySpace:album' diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index f936b92bb..1ca7b1a9e 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www\.myspass\.de/.*' + _VALID_URL = r'https?://www\.myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': 
'0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 1e21cf98a..6d447a493 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -9,8 +9,8 @@ import json from .common import InfoExtractor from ..compat import ( compat_ord, - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -20,7 +20,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): _WORKING = False - _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' + _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' IE_NAME = 'myvideo' _TEST = { 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -112,7 +112,7 @@ class MyVideoIE(InfoExtractor): encxml = compat_urllib_parse_unquote(b) if not params.get('domain'): params['domain'] = 'www.myvideo.de' - xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params)) + xmldata_url = '%s?%s' % (encxml, compat_urllib_parse_urlencode(params)) if 'flash_playertype=MTV' in xmldata_url: self._downloader.report_warning('avoiding MTV player') xmldata_url = ( diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index a94ab8358..731c24542 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class MyVidsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' + _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' _TEST = { 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 7ce8d9b18..d5e53365c 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -8,7 +8,7 @@ from 
..utils import ( class NationalGeographicIE(InfoExtractor): - _VALID_URL = r'http://video\.nationalgeographic\.com/.*?' + _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' _TESTS = [ { diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 1f5fc2145..6d6f69b44 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -53,8 +53,8 @@ class NaverIE(InfoExtractor): raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) - query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, }) - query_urls = compat_urllib_parse.urlencode({ + query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, }) + query_urls = compat_urllib_parse_urlencode({ 'masterVid': vid, 'protocol': 'p2p', 'inKey': key, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 3e2b3e599..d896b0d04 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -97,7 +97,7 @@ class NBAIE(InfoExtractor): _PAGE_SIZE = 30 def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({ + search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' 
+ compat_urllib_parse_urlencode({ 'type': 'teamvideo', 'start': page * self._PAGE_SIZE + 1, 'npp': (page + 1) * self._PAGE_SIZE + 1, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index bb0817e34..43d75d3ca 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -115,7 +115,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https because its certificate is invalid - _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -134,6 +134,30 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') +class CSNNEIE(InfoExtractor): + _VALID_URL = r'https?://www\.csnne\.com/video/(?P<id>[0-9a-z-]+)' + + _TEST = { + 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', + 'info_dict': { + 'id': 'yvBLLUgQ8WU0', + 'ext': 'mp4', + 'title': 'SNC evening update: Wright named Red Sox\' No. 
5 starter.', + 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': self._html_search_meta('twitter:player:stream', webpage), + 'display_id': display_id, + } + + class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| @@ -295,7 +319,7 @@ class NBCNewsIE(ThePlatformIE): class MSNBCIE(InfoExtractor): # https URLs redirect to corresponding http ones - _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 7830616f8..0d36474fa 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -8,7 +8,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_str, compat_itertools_count, ) @@ -153,7 +153,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'ids': '[%s]' % song_id } info = self.query_api( - 'song/detail?' + compat_urllib_parse.urlencode(params), + 'song/detail?' 
+ compat_urllib_parse_urlencode(params), song_id, 'Downloading song info')['songs'][0] formats = self.extract_formats(info) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index d1688457f..aae7aeeeb 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -7,7 +7,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' + _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' 
_TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py index 657ae77a0..9ccd7d774 100644 --- a/youtube_dl/extractor/nextmovie.py +++ b/youtube_dl/extractor/nextmovie.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class NextMovieIE(MTVServicesInfoExtractor): @@ -20,7 +20,7 @@ class NextMovieIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse.urlencode({ + return compat_urllib_parse_urlencode({ 'feed': '1505', 'mgid': uri, }) diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py index 5bd15f7a7..51e4a34f7 100644 --- a/youtube_dl/extractor/nfb.py +++ b/youtube_dl/extractor/nfb.py @@ -1,8 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + urlencode_postdata, +) class NFBIE(InfoExtractor): @@ -40,7 +42,7 @@ class NFBIE(InfoExtractor): request = sanitized_Request( 'https://www.nfb.ca/film/%s/player_config' % video_id, - compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii')) + urlencode_postdata({'getConfig': 'true'})) request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 8d5ce46ad..c1dea8b6c 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -7,7 +7,7 @@ import os from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse ) 
from ..utils import ( @@ -38,7 +38,7 @@ class NHLBaseInfoExtractor(InfoExtractor): parsed_url = compat_urllib_parse_urlparse(initial_video_url) filename, ext = os.path.splitext(parsed_url.path) path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'type': 'fvod', 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) }) @@ -211,7 +211,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor): r'tab0"[^>]*?>(.*?)</td>', webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'cid': cat_id, # This is the default value 'count': 12, diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index b62819ae5..ce065f2b0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class NickIE(MTVServicesInfoExtractor): @@ -54,7 +54,7 @@ class NickIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse.urlencode({ + return compat_urllib_parse_urlencode({ 'feed': 'nick_arc_player_prime', 'mgid': uri, }) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 586e52a4a..dd75a48af 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -7,11 +7,10 @@ import datetime from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, parse_duration, @@ -19,6 +18,7 @@ from ..utils import ( sanitized_Request, xpath_text, determine_ext, + urlencode_postdata, ) @@ -101,7 +101,7 @@ class NiconicoIE(InfoExtractor): 'mail': username, 'password': password, } - login_data = 
compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') + login_data = urlencode_postdata(login_form_strs) request = sanitized_Request( 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( @@ -141,7 +141,7 @@ class NiconicoIE(InfoExtractor): r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') # Get flv info - flv_info_data = compat_urllib_parse.urlencode({ + flv_info_data = compat_urllib_parse_urlencode({ 'k': thumb_play_key, 'v': video_id }) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index d440313d5..06f2bda07 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -8,7 +8,6 @@ import hashlib from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -18,11 +17,12 @@ from ..utils import ( float_or_none, parse_iso8601, sanitized_Request, + urlencode_postdata, ) class NocoIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _SUB_LANG_TEMPLATE = '&sub_lang=%s' @@ -75,7 +75,7 @@ class NocoIE(InfoExtractor): 'username': username, 'password': password, } - request = sanitized_Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8') login = self._download_json(request, None, 'Logging in as %s' % username) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 5952d136f..77e091072 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -9,7 +9,7 @@ from ..utils 
import ( class NormalbootsIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' + _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', 'md5': '8bf6de238915dd501105b44ef5f1e0f6', diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 3f9c776ef..17671ad39 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -12,7 +12,7 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index d68c1ad79..a131f7dbd 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -7,7 +7,6 @@ from ..compat import compat_urlparse from ..utils import ( ExtractorError, NO_DEFAULT, - encode_dict, sanitized_Request, urlencode_postdata, ) @@ -73,7 +72,7 @@ class NovaMovIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(url, post_url) request = sanitized_Request( - post_url, urlencode_postdata(encode_dict(fields))) + post_url, urlencode_postdata(fields)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('Referer', post_url) webpage = self._download_webpage( diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py 
index 125c7010b..1777aa10b 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( int_or_none, qualities, @@ -9,7 +9,7 @@ from ..utils import ( class NprIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', 'info_dict': { @@ -38,7 +38,7 @@ class NprIE(InfoExtractor): playlist_id = self._match_id(url) config = self._download_json( - 'http://api.npr.org/query?%s' % compat_urllib_parse.urlencode({ + 'http://api.npr.org/query?%s' % compat_urllib_parse_urlencode({ 'id': playlist_id, 'fields': 'titles,audio,show', 'format': 'json', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b21fbd4d..9df200822 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -63,6 +63,7 @@ class NRKIE(InfoExtractor): if determine_ext(media_url) == 'f4m': formats = self._extract_f4m_formats( media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') + self._sort_formats(formats) else: formats = [{ 'url': media_url, diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 2cd924d05..0895d7ea4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,7 +11,7 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 7f254b867..681683e86 100644 --- a/youtube_dl/extractor/nytimes.py +++ 
b/youtube_dl/extractor/nytimes.py @@ -18,8 +18,9 @@ class NYTimesBaseIE(InfoExtractor): description = video_data.get('summary') duration = float_or_none(video_data.get('duration'), 1000) - uploader = video_data['byline'] - timestamp = parse_iso8601(video_data['publication_date'][:-8]) + uploader = video_data.get('byline') + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def get_file_size(file_size): if isinstance(file_size, int): @@ -37,7 +38,7 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] + } for video in video_data['renditions'] if video.get('url') ] self._sort_formats(formats) @@ -46,7 +47,7 @@ class NYTimesBaseIE(InfoExtractor): 'url': 'http://www.nytimes.com/%s' % image['url'], 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data['images'] + } for image in video_data.get('images', []) if image.get('url') ] return { diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index 080045d4c..1bf96ea56 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class OnceIE(InfoExtractor): - _VALID_URL = r'https?://once\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)' + _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)' ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' @@ -20,6 +20,10 @@ class 
OnceIE(InfoExtractor): media_item_id, 'mp4', m3u8_id='hls', fatal=False) progressive_formats = [] for adaptive_format in formats: + # Prevent advertisement from embedding into m3u8 playlist (see + # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684) + adaptive_format['url'] = re.sub( + r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) rendition_id = self._search_regex( r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', adaptive_format['url'], 'redition id', default=None) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 20b984288..16f040191 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -9,7 +9,7 @@ from ..utils import ( ExtractorError, unsmuggle_url, ) -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class OoyalaBaseIE(InfoExtractor): @@ -35,7 +35,7 @@ class OoyalaBaseIE(InfoExtractor): for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'domain': domain, 'supportedFormats': supported_format }), diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py new file mode 100644 index 000000000..4468f31fc --- /dev/null +++ b/youtube_dl/extractor/openload.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import ( + encode_base_n, + ExtractorError, +) + + +class OpenloadIE(InfoExtractor): + _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-]+)' + + _TESTS = [{ + 'url': 'https://openload.co/f/kUEfGclsU9o', + 'md5': 'bf1c059b004ebc7a256f89408e65c36e', + 'info_dict': { + 'id': 'kUEfGclsU9o', + 'ext': 'mp4', + 'title': 'skyrim_no-audio_1080.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 
'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', + 'only_matching': True, + }, { + 'url': 'https://openload.io/f/ZAn6oz-VZGE/', + 'only_matching': True, + }] + + @staticmethod + def openload_level2_debase(m): + radix, num = int(m.group(1)) + 27, int(m.group(2)) + return '"' + encode_base_n(num, radix) + '"' + + @classmethod + def openload_level2(cls, txt): + # The function name is ǃ \u01c3 + # Using escaped unicode literals does not work in Python 3.2 + return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') + + # Openload uses a variant of aadecode + # openload_decode and related functions are originally written by + # vitas@matfyz.cz and released with public domain + # See https://github.com/rg3/youtube-dl/issues/8489 + @classmethod + def openload_decode(cls, txt): + symbol_table = [ + ('_', '(゚Д゚) [゚Θ゚]'), + ('a', '(゚Д゚) [゚ω゚ノ]'), + ('b', '(゚Д゚) [゚Θ゚ノ]'), + ('c', '(゚Д゚) [\'c\']'), + ('d', '(゚Д゚) [゚ー゚ノ]'), + ('e', '(゚Д゚) [゚Д゚ノ]'), + ('f', '(゚Д゚) [1]'), + + ('o', '(゚Д゚) [\'o\']'), + ('u', '(o゚ー゚o)'), + ('c', '(゚Д゚) [\'c\']'), + + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('4', '(-~3)'), + ('3', '(-~-~1)'), + ('2', '(-~1)'), + ('1', '(-~0)'), + ('0', '((c^_^o)-(c^_^o))'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aachar in txt.split(delim): + for val, pat in symbol_table: + aachar = aachar.replace(pat, val) + aachar = aachar.replace('+ ', '') + m = re.match(r'^\d+', aachar) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aachar) + if m: + ret += compat_chr(int(m.group(1), 16)) + return cls.openload_level2(ret) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if 'File not found' in webpage: + raise ExtractorError('File not found', expected=True) + + code = self._search_regex( + r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + 
webpage, 'JS code') + + video_url = self._search_regex( + r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'url': video_url, + } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 958eb398b..66c75f8b3 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,7 +137,7 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. _TEST = { @@ -171,7 +171,7 @@ class ORFOE1IE(InfoExtractor): class ORFFM4IE(InfoExtractor): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' + _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' _TEST = { 'url': 'http://fm4.orf.at/player/20160110/IS/', @@ -222,7 +222,7 @@ class ORFFM4IE(InfoExtractor): class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' - _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' _TEST = { 'url': 'http://iptv.orf.at/stories/2275236/', diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index ec8876c28..229750665 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -65,7 +65,7 @@ class PatreonIE(InfoExtractor): request = sanitized_Request( 'https://www.patreon.com/processLogin', - compat_urllib_parse.urlencode(login_form).encode('utf-8') + compat_urllib_parse_urlencode(login_form).encode('utf-8') ) login_page = 
self._download_webpage(request, None, note='Logging in as %s' % username) diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index 6e60e5fe9..f1008ae51 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -12,7 +12,7 @@ from ..utils import ( class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py index 788411ccc..6c8bbe1d9 100644 --- a/youtube_dl/extractor/photobucket.py +++ b/youtube_dl/extractor/photobucket.py @@ -8,7 +8,7 @@ from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): - _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 2856af96f..57c875ef0 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -5,10 +5,10 @@ import re import os.path from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -40,7 +40,7 @@ class PlayedIE(InfoExtractor): self._sleep(2, video_id) - post 
= compat_urllib_parse.urlencode(data) + post = urlencode_postdata(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index e360404f7..1e8096a25 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -106,7 +106,7 @@ class PlaytvakIE(InfoExtractor): }) info_url = compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) json_info = self._download_json( info_url, video_id, diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 12e1c2862..df03dd419 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -8,7 +8,6 @@ import collections from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( parse_duration, qualities, sanitized_Request, + urlencode_postdata, ) @@ -76,7 +76,7 @@ class PluralsightIE(PluralsightBaseIE): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + post_url, urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') response = self._download_webpage( @@ -279,13 +279,18 @@ class PluralsightCourseIE(PluralsightBaseIE): course_id, 'Downloading course data JSON') entries = [] - for module in course_data: + for num, module in enumerate(course_data, 1): for clip in module.get('clips', []): player_parameters = clip.get('playerParameters') if not 
player_parameters: continue - entries.append(self.url_result( - '%s/training/player?%s' % (self._API_BASE, player_parameters), - 'Pluralsight')) + entries.append({ + '_type': 'url_transparent', + 'url': '%s/training/player?%s' % (self._API_BASE, player_parameters), + 'ie_key': PluralsightIE.ie_key(), + 'chapter': module.get('title'), + 'chapter_number': num, + 'chapter_id': module.get('moduleRef'), + }) return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 3e15533e9..9894f3262 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, +) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -28,9 +31,10 @@ class Porn91IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = self._download_webpage(url, video_id, 'get HTML content') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -46,7 +50,7 @@ class Porn91IE(InfoExtractor): r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') max_vid = self._search_regex( r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') - url_params = compat_urllib_parse.urlencode({ + url_params = compat_urllib_parse_urlencode({ 'VID': file_id, 'mp4': '1', 'seccode': sec_code, @@ -54,8 +58,9 @@ class Porn91IE(InfoExtractor): }) info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' 
+ url_params, video_id, - 'get real video url') - video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + 'Downloading real video url') + video_url = compat_urllib_parse_unquote(self._search_regex( + r'file=([^&]+)&', info_cn, 'url')) duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 57c78ba52..39b53ecf6 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -12,7 +12,7 @@ from ..utils import ( class PornHdIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 5a55c25e7..407ea08d4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals +import itertools import os import re from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, @@ -12,6 +14,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + orderedSet, sanitized_Request, str_to_int, ) @@ -75,7 +78,7 @@ class PornHubIE(InfoExtractor): flashvars = self._parse_json( self._search_regex( - r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), + r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), video_id) if flashvars: video_title = flashvars.get('video_title') @@ -149,9 +152,12 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): def 
_extract_entries(self, webpage): return [ - self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key()) - for video_url in set(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) + self.url_result( + 'http://www.pornhub.com/%s' % video_url, + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + webpage)) ] def _real_extract(self, url): @@ -185,16 +191,31 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' _TESTS = [{ - 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { - 'id': 'rushandlia', + 'id': 'zoe_ph', }, - 'playlist_mincount': 13, + 'playlist_mincount': 171, + }, { + 'url': 'http://www.pornhub.com/users/rushandlia/videos', + 'only_matching': True, }] def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) + entries = [] + for page_num in itertools.count(1): + try: + webpage = self._download_webpage( + url, user_id, 'Downloading page %d' % page_num, + query={'page': page_num}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + page_entries = self._extract_entries(webpage) + if not page_entries: + break + entries.extend(page_entries) - return self.playlist_result(self._extract_entries(webpage), user_id) + return self.playlist_result(entries, user_id) diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 1a53fd71c..6b51e5c54 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -13,7 +13,7 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = 
r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py index 85aae9576..0c1024772 100644 --- a/youtube_dl/extractor/primesharetv.py +++ b/youtube_dl/extractor/primesharetv.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -42,7 +42,7 @@ class PrimeShareTVIE(InfoExtractor): self._sleep(wait_time, video_id) req = sanitized_Request( - url, compat_urllib_parse.urlencode(fields), headers) + url, urlencode_postdata(fields), headers) video_page = self._download_webpage( req, video_id, 'Downloading video page') diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index d5357283a..f93bd19ff 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( determine_ext, ExtractorError, sanitized_Request, + urlencode_postdata, ) @@ -34,7 +34,7 @@ class PromptFileIE(InfoExtractor): expected=True) fields = self._hidden_inputs(webpage) - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 670e6950f..07d49d489 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py 
@@ -5,9 +5,7 @@ import re from hashlib import sha1 from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, determine_ext, @@ -235,7 +233,7 @@ class ProSiebenSat1IE(InfoExtractor): client_name = 'kolibri-2.0.19-splec4' client_location = url - videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ + videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_location': client_location, 'client_name': client_name, @@ -256,7 +254,7 @@ class ProSiebenSat1IE(InfoExtractor): client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) .encode('utf-8')).hexdigest() - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({ + sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_id': client_id, 'client_location': client_location, @@ -270,7 +268,7 @@ class ProSiebenSat1IE(InfoExtractor): client_location, source_ids_str, g, client_name]) .encode('utf-8')).hexdigest() - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({ + url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ 'access_token': access_token, 'client_id': client_id, 'client_location': client_location, diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 30a5f2de4..cc0416cb8 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' 
+ _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' _TESTS = [ { diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 45a3c41c5..ff0af9543 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', @@ -260,7 +260,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=toplist&p=global_123', @@ -314,7 +314,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = 
r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=taoge&id=3462654915', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index a4dc5c335..e36ce1aa1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -18,7 +18,7 @@ from ..utils import ( class RaiTVIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -175,7 +175,7 @@ class RaiTVIE(InfoExtractor): class RaiIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index d6054d717..7ba41ba59 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class RedTubeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py index b17c2bfc0..fd50065d4 100644 --- a/youtube_dl/extractor/restudy.py +++ 
b/youtube_dl/extractor/restudy.py @@ -31,6 +31,7 @@ class RestudyIE(InfoExtractor): formats = self._extract_smil_formats( 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 508758075..2c2c707bd 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', 'md5': 'd25945f5df41cdca2d2587165ac28720', diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 042bc8dab..9c89974e7 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -49,6 +49,7 @@ class RteIE(InfoExtractor): # f4m_url = server + relative_url f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] f4m_formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(f4m_formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 8a8c5d2a0..79af47715 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -62,7 +62,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -179,7 
+179,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', @@ -209,6 +209,7 @@ class RTVELiveIE(InfoExtractor): png = self._download_webpage(png_url, video_id, 'Downloading url information') m3u8_url = _decrypt_url(png) formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 7c9d4b0cd..4896d09d6 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -38,6 +38,7 @@ class RTVNHIE(InfoExtractor): item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) elif item.get('type') == '': formats.append({'url': item['file']}) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 0e470e73f..1f7c26299 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class RUHDIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index c5c47d01e..9ca4ae147 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -122,7 +122,7 @@ class RutubeEmbedIE(InfoExtractor): class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' - _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 
'url': 'http://rutube.ru/tags/video/1800/', 'info_dict': { @@ -156,7 +156,7 @@ class RutubeChannelIE(InfoExtractor): class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' - _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' @@ -174,7 +174,7 @@ class RutubeMovieIE(RutubeChannelIE): class RutubePersonIE(RutubeChannelIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' - _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/video/person/313878/', 'info_dict': { diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index f7fe1fece..a2379eb04 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -14,7 +14,7 @@ class RUTVIE(InfoExtractor): IE_DESC = 'RUTV.RU' _VALID_URL = r'''(?x) https?://player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path>flash2v/container\.swf\?id= + (?P<path>flash\d+v/container\.swf\?id= |iframe/(?P<type>swf|video|live)/id/ |index/iframe/cast_id/) (?P<id>\d+)''' @@ -109,7 +109,7 @@ class RUTVIE(InfoExtractor): return mobj.group('url') mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', webpage) if mobj: return mobj.group('url') @@ -119,7 +119,7 @@ class RUTVIE(InfoExtractor): video_id = mobj.group('id') video_path = mobj.group('path') - if video_path.startswith('flash2v'): + if re.match(r'flash\d+v', video_path): video_type = 'video' elif video_path.startswith('iframe'): video_type = mobj.group('type') @@ -168,7 +168,7 @@ class RUTVIE(InfoExtractor): 
'play_path': mobj.group('playpath'), 'app': mobj.group('app'), 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', 'vbr': int(quality), diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 256396bb8..6ba91f202 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -75,16 +75,7 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'''(?x)https?:// - (?:www\.)?safaribooksonline\.com/ - (?: - library/view/[^/]+| - api/v1/book - )/ - (?P<course_id>[^/]+)/ - (?:chapter(?:-content)?/)? - (?P<part>part\d+)\.html - ''' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -97,9 +88,6 @@ class SafariIE(SafariBaseIE): 'upload_date': '20150724', 'uploader_id': 'stork', }, - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', - 'only_matching': True, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', @@ -108,13 +96,18 @@ class SafariIE(SafariBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('course_id') - part = mobj.group('part') + video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, '%s/%s' % (course_id, part)) - reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id') - partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id') - ui_id = self._search_regex(r'data-ui-id="([^"]+)"', 
webpage, 'kaltura uiconf id') + webpage = self._download_webpage(url, video_id) + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura widget id', group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura uiconf id', group='id') query = { 'wid': '_%s' % partner_id, @@ -125,7 +118,7 @@ class SafariIE(SafariBaseIE): if self.LOGGED_IN: kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), - course_id, 'Downloading kaltura session JSON', + video_id, 'Downloading kaltura session JSON', 'Unable to download kaltura session JSON', fatal=False) if kaltura_session: session = kaltura_session.get('session') @@ -137,6 +130,23 @@ class SafariIE(SafariBaseIE): 'Kaltura') +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html' + + _TEST = { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) + + class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' @@ -168,7 +178,7 @@ class SafariCourseIE(SafariBaseIE): 'No chapters found for course %s' % course_id, expected=True) entries = [ - self.url_result(chapter, 'Safari') + self.url_result(chapter, SafariApiIE.ie_key()) for chapter in course_json['chapters']] course_title = course_json['title'] diff --git a/youtube_dl/extractor/screenjunkies.py 
b/youtube_dl/extractor/screenjunkies.py index f2af15f6b..dd0a6ba19 100644 --- a/youtube_dl/extractor/screenjunkies.py +++ b/youtube_dl/extractor/screenjunkies.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'http://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 4d3b58522..c5f474dd1 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 1178b7a27..d95ea06be 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, @@ -77,11 +77,12 @@ class ShahidIE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + self._sort_formats(formats) video = self._download_json( '%s/%s/%s?%s' % ( 
api_vars['url'], api_vars['playerType'], api_vars['id'], - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'apiKey': 'sh@hid0nlin3', 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', })), diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8eda3c864..e7e5f653e 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -3,17 +3,17 @@ from __future__ import unicode_literals import base64 from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' + _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' _TESTS = [{ 'url': 'http://shared.sx/0060718775', @@ -45,7 +45,7 @@ class SharedIE(InfoExtractor): download_form = self._hidden_inputs(webpage) request = sanitized_Request( - url, compat_urllib_parse.urlencode(download_form)) + url, urlencode_postdata(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py index f1ea9bdb2..9cce5ceb4 100644 --- a/youtube_dl/extractor/sharesix.py +++ b/youtube_dl/extractor/sharesix.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( parse_duration, sanitized_Request, + urlencode_postdata, ) @@ -47,7 +47,7 @@ class ShareSixIE(InfoExtractor): fields = { 'method_free': 'Free' } - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 
b2258a0f6..d03f1b1d4 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode from ..utils import sanitized_Request @@ -39,7 +39,7 @@ class SinaIE(InfoExtractor): ] def _extract_video(self, video_id): - data = compat_urllib_parse.urlencode({'vid': video_id}) + data = compat_urllib_parse_urlencode({'vid': video_id}) url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, video_id, 'Downloading video url') image_page = self._download_webpage( diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 015ef75f3..5c3fd0fec 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -7,12 +7,12 @@ import hashlib import uuid from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, unified_strdate, + urlencode_postdata, ) @@ -175,7 +175,7 @@ class SmotriIE(InfoExtractor): video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) + 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') video = self._download_json(request, video_id, 'Downloading video JSON') @@ -338,7 +338,7 @@ class SmotriBroadcastIE(InfoExtractor): } request = sanitized_Request( - broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') diff --git 
a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index ea8fc258d..49e5d09ae 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, @@ -170,7 +170,7 @@ class SohuIE(InfoExtractor): if retries > 0: download_note += ' (retry #%d)' % retries part_info = self._parse_json(self._download_webpage( - 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)), video_id, download_note), video_id) video_url = part_info['url'] diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 1efb2b980..194dabc71 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -11,10 +11,9 @@ from .common import ( from ..compat import ( compat_str, compat_urlparse, - compat_urllib_parse, + compat_urllib_parse_urlencode, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, unified_strdate, @@ -393,7 +392,7 @@ class SoundcloudUserIE(SoundcloudIE): query = COMMON_QUERY.copy() query['offset'] = 0 - next_href = base_url + '?' + compat_urllib_parse.urlencode(query) + next_href = base_url + '?' 
+ compat_urllib_parse_urlencode(query) entries = [] for i in itertools.count(): @@ -424,7 +423,7 @@ class SoundcloudUserIE(SoundcloudIE): qs = compat_urlparse.parse_qs(parsed_next_href.query) qs.update(COMMON_QUERY) next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) return { '_type': 'playlist', @@ -460,7 +459,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): if token: data_dict['secret_token'] = token - data = compat_urllib_parse.urlencode(data_dict) + data = compat_urllib_parse_urlencode(data_dict) data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') @@ -500,7 +499,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): query['client_id'] = self._CLIENT_ID query['linked_partitioning'] = '1' query['offset'] = 0 - data = compat_urllib_parse.urlencode(encode_dict(query)) + data = compat_urllib_parse_urlencode(query) next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data) collected_results = 0 diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index dfe50ed45..7e6783306 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 86d509ae5..4f0c66213 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -99,6 +99,7 @@ class SportBoxEmbedIE(InfoExtractor): webpage, 'hls file') formats = self._extract_m3u8_formats(hls, video_id, 'mp4') + self._sort_formats(formats) title = self._search_regex( 
r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py index 13101c714..54d1843f2 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/ssa.py @@ -8,7 +8,7 @@ from ..utils import ( class SSAIE(InfoExtractor): - _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' + _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P<id>\d+)' _TEST = { 'url': 'http://ssa.nls.uk/film/3561', 'info_dict': { diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index 77841b946..712359885 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + urlencode_postdata, +) class StreamcloudIE(InfoExtractor): @@ -35,7 +37,7 @@ class StreamcloudIE(InfoExtractor): (?:id="[^"]+"\s+)? 
value="([^"]*)" ''', orig_webpage) - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) self._sleep(12, video_id) headers = { diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index aa5964acb..f562aa6d3 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class SztvHuIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index d1b7264b4..b49ab5f5b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..compat import compat_ord class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' _TESTS = [ { 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 4e860db0a..a29a64b6d 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -11,7 +11,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 2c8e9b941..4b4b740b4 100644 --- 
a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -5,8 +5,8 @@ import json from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -74,7 +74,7 @@ class TelecincoIE(InfoExtractor): info_el = self._download_xml(info_url, episode).find('./video/info') video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) + token_query = compat_urllib_parse_urlencode({'id': video_link}) token_info = self._download_json( embed_data['flashvars']['ov_tk'] + '?' + token_query, episode, @@ -82,6 +82,7 @@ class TelecincoIE(InfoExtractor): ) formats = self._extract_m3u8_formats( token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) return { 'id': embed_data['videoId'], diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 9ee844684..3f54b2744 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 863914299..236c99972 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -82,7 +82,7 @@ class ThePlatformBaseIE(OnceIE): class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - 
(?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ @@ -170,10 +170,10 @@ class ThePlatformIE(ThePlatformBaseIE): if not provider_id: provider_id = 'dJ5BDC' - path = provider_id + path = provider_id + '/' if mobj.group('media'): - path += '/media' - path += '/' + video_id + path += mobj.group('media') + path += video_id qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) if 'guid' in qs_dict: diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py new file mode 100644 index 000000000..3e4e14031 --- /dev/null +++ b/youtube_dl/extractor/thescene.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_urlparse +from ..utils import qualities + + +class TheSceneIE(InfoExtractor): + _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + + _TEST = { + 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', + 'info_dict': { + 'id': '520e8faac2b4c00e3c6e5f43', + 'ext': 'mp4', + 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', + 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + player_url = compat_urlparse.urljoin( + url, + self._html_search_regex( + r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) + + player = self._download_webpage(player_url, display_id) + info = self._parse_json( + self._search_regex( + r'(?m)var\s+video\s+=\s+({.+?});$', player, 'info json'), + display_id) + + qualities_order = qualities(('low', 'high')) + formats = [{ + 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), + 'url': f['src'], + 'quality': 
qualities_order(f['quality']), + } for f in info['sources'][0]] + self._sort_formats(formats) + + return { + 'id': info['id'], + 'display_id': display_id, + 'title': info['title'], + 'formats': formats, + 'thumbnail': info.get('poster_frame'), + } diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 496f15d80..406f4a826 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -10,7 +10,7 @@ from ..utils import ( class THVideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' _TEST = { 'url': 'http://thvideo.tv/v/th1987/', 'md5': 'fa107b1f73817e325e9433505a70db50', diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py index e036b8cdf..c43cace24 100644 --- a/youtube_dl/extractor/tinypic.py +++ b/youtube_dl/extractor/tinypic.py @@ -9,7 +9,7 @@ from ..utils import ExtractorError class TinyPicIE(InfoExtractor): IE_NAME = 'tinypic' IE_DESC = 'tinypic.com videos' - _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' _TESTS = [ { diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 17add9543..abad3ff64 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -9,7 +9,7 @@ from ..compat import compat_parse_qs class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'http://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' + _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' 
_TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 2756f56d3..2579ba8c6 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -41,7 +41,7 @@ class ToypicsIE(InfoExtractor): class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 0e01b15fc..747370d12 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): _WORKING = False - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'md5': '41365557f3c8c397d091da510e73ceb4', diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py index d239949a6..657705623 100644 --- a/youtube_dl/extractor/trollvids.py +++ b/youtube_dl/extractor/trollvids.py @@ -7,7 +7,7 @@ from .nuevo import NuevoBaseIE class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' IE_NAME = 'trollvids' _TEST = { 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', diff --git a/youtube_dl/extractor/tubitv.py 
b/youtube_dl/extractor/tubitv.py index 6d78b5dfe..7af233cd6 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -5,11 +5,11 @@ import codecs import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -41,7 +41,7 @@ class TubiTvIE(InfoExtractor): 'username': username, 'password': password, } - payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') + payload = urlencode_postdata(form_data) request = sanitized_Request(self._LOGIN_URL, payload) request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_page = self._download_webpage( @@ -69,6 +69,7 @@ class TubiTvIE(InfoExtractor): apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') m3u8_url = codecs.decode(apu, 'rot_13')[::-1] formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index f56b66d06..9892e8a62 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + InAdvancePagedList, float_or_none, unescapeHTML, ) @@ -75,15 +76,16 @@ class TudouIE(InfoExtractor): quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), key=lambda k: int(k))[-1] parts = segments[quality] - result = [] len_parts = len(parts) if len_parts > 1: self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: + + def part_func(partnum): + part = parts[partnum] part_id = part['k'] final_url = self._url_for_id(part_id, quality) ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { + return [{ 'id': '%s' % part_id, 'url': final_url, 'ext': ext, @@ -97,12 +99,13 @@ class TudouIE(InfoExtractor): 'http_headers': { 'Referer': self._PLAYER_URL, }, 
- } - result.append(part_info) + }] + + entries = InAdvancePagedList(part_func, len_parts, 1) return { '_type': 'multi_video', - 'entries': result, + 'entries': entries, 'id': video_id, 'title': title, } diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 4f844706d..4d8b57111 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -67,6 +67,34 @@ class TumblrIE(InfoExtractor): 'uploader_id': 'user32021558', }, 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? 
#VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': 're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 8322cc14d..ae4cfaec2 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re from .common import InfoExtractor from ..utils import ExtractorError @@ -27,10 +27,9 @@ class TuneInBaseIE(InfoExtractor): if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) - stream_data = self._download_webpage( - streams_url, content_id, note='Downloading stream data') - streams = json.loads(self._search_regex( - r'\((.*)\);', stream_data, 'stream info'))['Streams'] + streams = self._download_json( + streams_url, content_id, note='Downloading stream data', + transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] is_live = None formats = [] diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 1457e524e..86bb7915d 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -14,7 +14,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -100,7 +100,7 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 3a4f393fc..4065354dd 100644 --- a/youtube_dl/extractor/tvc.py +++ 
b/youtube_dl/extractor/tvc.py @@ -11,7 +11,7 @@ from ..utils import ( class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -64,7 +64,7 @@ class TVCIE(InfoExtractor): class TVCArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', 'info_dict': { diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index b4683de54..df70a6b23 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -13,7 +13,7 @@ from ..utils import ( class TVPlayIE(InfoExtractor): IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)http://(?:www\.)? + _VALID_URL = r'''(?x)https?://(?:www\.)? 
(?:tvplay\.lv/parraides| tv3play\.lt/programos| play\.tv3\.lt/programos| diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index d4169ec6d..36ee1adff 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -9,18 +9,18 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( - encode_dict, ExtractorError, int_or_none, orderedSet, parse_duration, parse_iso8601, sanitized_Request, + urlencode_postdata, ) @@ -82,7 +82,7 @@ class TwitchBaseIE(InfoExtractor): post_url = compat_urlparse.urljoin(redirect_url, post_url) request = sanitized_Request( - post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8')) + post_url, urlencode_postdata(login_form)) request.add_header('Referer', redirect_url) response = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -250,7 +250,7 @@ class TwitchVodIE(TwitchItemBaseIE): formats = self._extract_m3u8_formats( '%s/vod/%s?%s' % ( self._USHER_BASE, item_id, - compat_urllib_parse.urlencode({ + compat_urllib_parse_urlencode({ 'allow_source': 'true', 'allow_audio_only': 'true', 'allow_spectre': 'true', @@ -442,7 +442,7 @@ class TwitchStreamIE(TwitchBaseIE): } formats = self._extract_m3u8_formats( '%s/api/channel/hls/%s.m3u8?%s' - % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)), + % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)), channel_id, 'mp4') self._prefer_source(formats) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index e70b2ab3c..1f32ea2eb 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -102,6 +102,9 @@ class TwitterCardIE(TwitterBaseIE): r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), video_id) + if config.get('source_type') == 'vine': + return 
self.url_result(config['player_url'], 'Vine') + def _search_dimensions_in_video_url(a_format, video_url): m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) if m: @@ -110,10 +113,9 @@ class TwitterCardIE(TwitterBaseIE): 'height': int(m.group('height')), }) - playlist = config.get('playlist') - if playlist: - video_url = playlist[0]['source'] + video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + if video_url: f = { 'url': video_url, } @@ -185,7 +187,6 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', @@ -247,6 +248,18 @@ class TwitterIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, { + 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', + 'md5': '89a15ed345d13b86e9a5a5e051fa308a', + 'info_dict': { + 'id': 'MIOxnrUteUd', + 'ext': 'mp4', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', + 'upload_date': '20140615', + }, + 'add_ie': ['Vine'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py index d50237758..1d52cbc98 100644 --- a/youtube_dl/extractor/ubu.py +++ b/youtube_dl/extractor/ubu.py @@ -10,7 +10,7 @@ from ..utils import ( class UbuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' _TEST = { 'url': 'http://ubu.com/film/her_noise.html', 'md5': '138d5652618bf0f03878978db9bef1ee', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 74cc36ece..71bea5363 100644 --- a/youtube_dl/extractor/udemy.py +++ 
b/youtube_dl/extractor/udemy.py @@ -1,23 +1,38 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_request, + compat_urlparse, ) from ..utils import ( + determine_ext, + extract_attributes, ExtractorError, float_or_none, int_or_none, sanitized_Request, unescapeHTML, + urlencode_postdata, ) class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' @@ -33,34 +48,42 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] - def _enroll_course(self, webpage, course_id): + def _enroll_course(self, base_url, webpage, course_id): + def combine_url(base_url, url): + return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url + checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( 'Course %s is not free. You have to pay for it before you can download. 
' - 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) + 'Use this URL to confirm purchase: %s' + % (course_id, combine_url(base_url, checkout_url)), + expected=True) enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: - webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + webpage = self._download_webpage( + combine_url(base_url, enroll_url), + course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) def _download_lecture(self, course_id, lecture_id): return self._download_json( 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse.urlencode({ - 'video_only': '', - 'auto_play': '', - 'fields[lecture]': 'title,description,asset', + course_id, lecture_id, compat_urllib_parse_urlencode({ + 'fields[lecture]': 'title,description,view_html,asset', 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - 'instructorPreviewMode': 'False', })), lecture_id, 'Downloading lecture JSON') @@ -123,7 +146,7 @@ class UdemyIE(InfoExtractor): }) request = sanitized_Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + self._LOGIN_URL, urlencode_postdata(login_form)) request.add_header('Referer', self._ORIGIN_URL) request.add_header('Origin', self._ORIGIN_URL) @@ -152,7 +175,7 @@ class UdemyIE(InfoExtractor): except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) lecture = 
self._download_lecture(course_id, lecture_id) else: raise @@ -177,41 +200,90 @@ class UdemyIE(InfoExtractor): video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), - } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) - else: - f['format_id'] = '%sp' % format_id - formats.append(f) - self._sort_formats(formats) + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('height') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 
'filesize': int_or_none(src.get('file_size_in_bytes')), + } + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + def add_output_format_meta(f, key): + output = outputs.get(key) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + return output_format + return f + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + + view_html = lecture.get('view_html') + if view_html: + view_html_urls = set() + for source in re.findall(r'<source[^>]+>', view_html): + attributes = extract_attributes(source) + src = attributes.get('src') + if not src: + continue + res = attributes.get('data-res') + height = int_or_none(res) + if src in view_html_urls: + continue + view_html_urls.add(src) + if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in m3u8_formats: + m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url']) + if m: + if not f.get('height'): + f['height'] = int(m.group('height')) + if not f.get('tbr'): + f['tbr'] = int(m.group('tbr')) + formats.extend(m3u8_formats) + else: + formats.append(add_output_format_meta({ + 'url': src, + 'format_id': '%dp' % height if height else None, + 'height': height, + }, res)) + + self._sort_formats(formats, field_preference=('height', 'width', 
'tbr', 'format_id')) return { 'id': video_id, @@ -244,7 +316,7 @@ class UdemyCourseIE(UdemyIE): course_id = response['id'] course_title = response.get('title') - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 594bee4f9..66d9f1bf3 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -7,7 +7,7 @@ from ..utils import qualities class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' + _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 3794bcded..dff1bb702 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -2,18 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( ExtractorError, sanitized_Request, + urlencode_postdata, ) class Vbox7IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' _TEST = { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', @@ -48,7 +46,7 @@ class Vbox7IE(InfoExtractor): webpage, 'title').split('/')[0].strip() info_url = 'http://vbox7.com/play/magare.do' - data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}) + data = urlencode_postdata({'as3': '1', 'vid': video_id}) info_request = sanitized_Request(info_url, data) info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') 
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 9633f7ffe..23ce0a0d1 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,7 +12,7 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index a0c59a2e0..cb64ae0bd 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -10,7 +10,7 @@ from .rutv import RUTVIE class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' - _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 152fef42e..147480f64 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -152,7 +152,7 @@ class VevoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( json_url, video_id, 'Downloading video info', 'Unable to download info') video_info = response.get('video') or {} diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e148b1ef5..b11cd254c 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -214,7 +214,7 @@ class VGTVIE(XstreamIE): class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' + _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 
'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -241,7 +241,7 @@ class BTArticleIE(InfoExtractor): class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 6bfbd4d85..8d92aee87 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -93,7 +93,7 @@ class ViddlerIE(InfoExtractor): headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'} request = sanitized_Request( 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s' - % compat_urllib_parse.urlencode(query), None, headers) + % compat_urllib_parse_urlencode(query), None, headers) data = self._download_json(request, video_id)['video'] formats = [] diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 0bd1e1eec..04e95c66e 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -111,6 +111,7 @@ class VideomoreIE(InfoExtractor): video_url = xpath_text(video, './/video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') + self._sort_formats(formats) data = self._download_json( 'http://videomore.ru/video/tracks/%s.json' % video_id, diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index 2cd36508a..0f798711b 100644 --- 
a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -14,7 +14,7 @@ class VideoTtIE(InfoExtractor): _WORKING = False ID_NAME = 'video.tt' IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' + _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' _TESTS = [{ 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index c76c20614..6645c6186 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -50,6 +50,7 @@ class VierIE(InfoExtractor): playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) description = self._og_search_description(webpage, default=None) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 315984bf9..a4f914d14 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -15,7 +15,7 @@ from ..utils import ( class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: videolectures\.net| flexilearn\.viidea\.net| presentations\.ocwconsortium\.org| @@ -151,6 +151,7 @@ class ViideaIE(InfoExtractor): smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) + self._sort_formats(info['formats']) info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) if multipart: diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py 
index 71c30d2cd..707a5735a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -12,7 +12,6 @@ from ..compat import ( ) from ..utils import ( determine_ext, - encode_dict, ExtractorError, InAdvancePagedList, int_or_none, @@ -42,13 +41,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - })) + }) login_request = sanitized_Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Referer', self._LOGIN_URL) @@ -255,10 +254,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata(encode_dict({ + data = urlencode_postdata({ 'password': password, 'token': token, - })) + }) if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') @@ -274,7 +273,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = urlencode_postdata(encode_dict({'password': password})) + data = urlencode_postdata({'password': password}) pass_url = url + '/check-password' password_request = sanitized_Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') @@ -575,7 +574,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = 
password - post = urlencode_postdata(encode_dict(fields)) + post = urlencode_postdata(fields) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d560a4b5e..67220f1b7 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -5,10 +5,7 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -17,6 +14,7 @@ from ..utils import ( str_to_int, unescapeHTML, unified_strdate, + urlencode_postdata, ) from .vimeo import VimeoIE from .pladform import PladformIE @@ -204,7 +202,7 @@ class VKIE(InfoExtractor): request = sanitized_Request( 'https://login.vk.com/?act=login', - compat_urllib_parse.urlencode(login_form).encode('utf-8')) + urlencode_postdata(login_form)) login_page = self._download_webpage( request, None, note='Logging in as %s' % username) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 9e2aa58bd..baf39bb2c 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -7,7 +7,7 @@ from ..utils import ( float_or_none, int_or_none, ) -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_urlencode class VLiveIE(InfoExtractor): @@ -43,7 +43,7 @@ class VLiveIE(InfoExtractor): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' - % compat_urllib_parse.urlencode({ + % compat_urllib_parse_urlencode({ 'videoId': long_video_id, 'key': key, 'ptc': 'http', @@ -64,7 +64,7 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( - r'<div[^>]+class="info_area"[^>]*>\s*<strong[^>]+class="name"[^>]*>([^<]+)</strong>', + 
r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)', webpage, 'creator', fatal=False) view_count = int_or_none(playinfo.get('meta', {}).get('count')) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index a97995a6d..a938a4007 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, NO_DEFAULT, sanitized_Request, + urlencode_postdata, ) @@ -38,7 +38,7 @@ class VodlockerIE(InfoExtractor): if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! - post = compat_urllib_parse.urlencode(fields) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') webpage = self._download_webpage( diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 149e36467..10ca6acb1 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -15,7 +15,7 @@ from ..utils import ( class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' _TESTS = [ { diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index a6d9b5fee..eaa888f00 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -14,7 +14,7 @@ from ..utils import ( class VuClipIE(InfoExtractor): - _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 24efbd6e6..8b9488340 100644 
--- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -11,7 +11,7 @@ from ..utils import ( class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 37cf3d309..5227bb5ad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,7 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' IE_NAME = 'wat.tv' _TESTS = [ { diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a851578e0..31c904303 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -244,7 +244,7 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' + _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' IE_DESC = 'Sendung mit der Maus' _TESTS = [{ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py index e333ae345..3dafbeec2 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/youtube_dl/extractor/weiqitv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' 
_TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index fb0accac7..828c03dc3 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index 4ff99e5ca..e4a2baad2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -5,7 +5,7 @@ from ..compat import compat_urllib_parse_unquote class XBefIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', 'md5': 'a478b565baff61634a98f5e5338be995', diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 94abdb4f3..2d1504eaa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,12 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse from ..utils import ( ExtractorError, - encode_dict, int_or_none, sanitized_Request, + urlencode_postdata, ) @@ -109,7 +108,7 @@ class XFileShareIE(InfoExtractor): if countdown: self._sleep(countdown, video_id) - post = compat_urllib_parse.urlencode(encode_dict(fields)) + post = urlencode_postdata(fields) req = sanitized_Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fd43e8854..b3547174d 100644 --- 
a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, float_or_none, int_or_none, unified_strdate, @@ -170,6 +171,12 @@ class XHamsterEmbedIE(InfoExtractor): video_url = self._search_regex( r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, - webpage, 'xhamster url') + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4c6142927..b2d8f4b48 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -8,6 +8,7 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -303,7 +304,7 @@ class YahooIE(InfoExtractor): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'protocol': 'http', 'region': region, }) diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 001ee17b6..63bbc0634 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)' + _VALID_URL = r'https?://mymedia.yam.com/m/(?P<id>\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index e699e663f..025716958 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ 
b/youtube_dl/extractor/yandexmusic.py @@ -5,15 +5,13 @@ import re import hashlib from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, float_or_none, sanitized_Request, + urlencode_postdata, ) @@ -170,14 +168,14 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) request = sanitized_Request( 'https://music.yandex.ru/handlers/track-entries.jsx', - compat_urllib_parse.urlencode({ + urlencode_postdata({ 'entries': ','.join(missing_track_ids), 'lang': mu.get('settings', {}).get('lang', 'en'), 'external-domain': 'music.yandex.ru', 'overembed': 'false', 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', - }).encode('utf-8')) + })) request.add_header('Referer', url) request.add_header('X-Requested-With', 'XMLHttpRequest') diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 869f3e819..0d943c343 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -9,7 +9,7 @@ from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', @@ -41,10 +41,12 @@ class YnetIE(InfoExtractor): m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title) if m: title = m.group('title') + formats = self._extract_f4m_formats(f4m_url, video_id) + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, video_id), + 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 
900eb2aba..fd7eb5a6d 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -8,7 +8,7 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_urlencode, compat_ord, ) from ..utils import ( @@ -138,7 +138,7 @@ class YoukuIE(InfoExtractor): '_00' + \ '/st/' + self.parse_ext_l(format) + \ '/fileid/' + get_fileid(format, n) + '?' + \ - compat_urllib_parse.urlencode(param) + compat_urllib_parse_urlencode(param) video_urls.append(video_url) video_urls_dict[format] = video_urls diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 466f5da2e..28355bf46 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -17,16 +17,15 @@ from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, compat_parse_qs, - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, compat_str, ) from ..utils import ( clean_html, - encode_dict, error_to_compat_str, ExtractorError, float_or_none, @@ -45,6 +44,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, uppercase_escape, + urlencode_postdata, ISO3166Utils, ) @@ -116,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') + login_data = urlencode_postdata(login_form_strs) req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( @@ -149,7 +149,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'TrustDevice': 'on', }) - tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') + tfa_data = urlencode_postdata(tfa_form_strs) tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( @@ -234,7 +234,9 @@ class 
YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): + for playlist_id in orderedSet(re.findall( + r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -1007,7 +1009,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ + params = compat_urllib_parse_urlencode({ 'lang': lang, 'v': video_id, 'fmt': ext, @@ -1056,7 +1058,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if caption_url: timestamp = args['timestamp'] # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ + list_params = compat_urllib_parse_urlencode({ 'type': 'list', 'tlangs': 1, 'asrs': 1, @@ -1075,7 +1077,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang = lang_node.attrib['lang_code'] sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ + params = compat_urllib_parse_urlencode({ 'lang': original_lang, 'tlang': sub_lang, 'fmt': ext, @@ -1094,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urlparse.urlparse(caption_url) + parsed_caption_url = compat_urllib_parse_urlparse(caption_url) caption_qs = compat_parse_qs(parsed_caption_url.query) sub_lang_list = {} @@ -1110,7 +1112,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fmt': [ext], }) sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - 
query=compat_urllib_parse.urlencode(caption_qs, True))) + query=compat_urllib_parse_urlencode(caption_qs, True))) sub_formats.append({ 'url': sub_url, 'ext': ext, @@ -1140,7 +1142,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'cpn': [cpn], }) playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -1225,7 +1227,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse.urlencode({ + data = compat_urllib_parse_urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( @@ -1911,7 +1913,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) def _real_extract(self, url): channel_id = self._match_id(url) @@ -1986,6 +1989,51 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live' + IE_NAME = 'youtube:live' + + _TESTS = [{ + 'url': 'http://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': 
're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default=None) + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + + class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' @@ -2039,7 +2087,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): 'spf': 'navigate', } url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query) + result_url = 'https://www.youtube.com/results?' 
+ compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9dd7a8034..7819f14ab 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -399,6 +399,10 @@ def parseOpts(overrideArguments=None): '-R', '--retries', dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite".') + downloader.add_option( + '--fragment-retries', + dest='fragment_retries', metavar='RETRIES', default=10, + help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', @@ -720,7 +724,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-subs', action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mkv and mp4 videos)') + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') postproc.add_option( '--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a8819f258..06b8c0548 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -331,17 +331,34 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): def run(self, information): - if information['ext'] not in ['mp4', 'mkv']: - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') return [], information subtitles = information.get('requested_subtitles') if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return 
[], information - sub_langs = list(subtitles.keys()) filename = information['filepath'] - sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] + + ext = information['ext'] + sub_langs = [] + sub_filenames = [] + webm_vtt_warn = False + + for lang, sub_info in subtitles.items(): + sub_ext = sub_info['ext'] + if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') + + if not sub_langs: + return [], information + input_files = [filename] + sub_filenames opts = [ diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 067b8a184..6d27b80c0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,6 +47,7 @@ from .compat import ( compat_str, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, @@ -416,9 +417,12 @@ def sanitize_path(s): # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of # unwanted failures due to missing protocol +def sanitize_url(url): + return 'http:%s' % url if url.startswith('//') else url + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request( - 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) def orderedSet(iterable): @@ -1315,7 +1319,7 @@ def shell_quote(args): def smuggle_url(url, data): """ Pass additional data in a URL for internal use. 
""" - sdata = compat_urllib_parse.urlencode( + sdata = compat_urllib_parse_urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -1746,6 +1750,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), @@ -1755,7 +1760,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') @@ -1787,22 +1793,15 @@ def read_batch_urls(batch_fd): def urlencode_postdata(*args, **kargs): - return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') + return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') def update_url_query(url, query): parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) - qs = encode_dict(qs) return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse.urlencode(qs, True))) - - -def encode_dict(d, encoding='utf-8'): - def encode(v): - return v.encode(encoding) if isinstance(v, compat_basestring) else v - return dict((encode(k), encode(v)) for k, v in d.items()) + query=compat_urllib_parse_urlencode(qs, True))) def dict_get(d, key_or_keys, default=None, skip_false_values=True): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6b2c5fac9..5daa7f4e8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.18' +__version__ = '2016.03.27'